diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..c09ac3d --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,40 @@ +# iText Security Policy + +## Reporting a Vulnerability + +We are committed to maintaining the security of our software. If you discover a security vulnerability, we encourage you to report it to us as soon as possible. + +To report a vulnerability, please visit our [Vulnerability Reporting Page](https://itextpdf.com/report-vulnerability), or email [vulnerability@apryse.com](mailto:vulnerability@apryse.com). If you do not receive a response in 2 business days, please follow up as we may not have received your message. + +We follow the procedure of Coordinated Vulnerability Disclosure (CVD) and, to protect the ecosystem, we request that those reporting do the same. Please visit the above page for more information, and follow the steps below to ensure that your report is handled promptly and appropriately: + +1. **Do not disclose the vulnerability publicly** until we have had a chance to address it. +2. **Provide a detailed description** of the vulnerability, including steps to reproduce it, if possible. +3. **Include any relevant information** such as the version of pdfOCR you are using, your operating system, and any other pertinent details. + +## Security Updates and Patches + + When a vulnerability is reported, we will: + +1. **Investigate and verify** the vulnerability. +2. **Develop and test** a fix for the vulnerability. +3. **Release a patch** as soon as possible. + + +## Known Vulnerabilities + +The iText Knowledge Base has a page for known [Common Vulnerabilities and Exposures](https://kb.itextpdf.com/itext/cves) (CVEs). Please check it to ensure your vulnerability has not already been disclosed or addressed. 
+ +## Supported product lines + +See [Compatibility Matrix](https://kb.itextpdf.com/itext/compatibility-matrix) + +## Security Best Practices + +To help ensure the security of your applications using pdfOCR, we recommend the following best practices: + +1. **Keep pdfOCR up to date** by regularly checking for and applying updates. +2. **Review and follow** our security guidelines for secure usage. +3. **Monitor your applications** for any unusual activity and investigate any anomalies promptly. + +Thank you for helping us keep iText secure! diff --git a/doxyfile b/doxyfile index 06dcdbe..d7884f8 100644 --- a/doxyfile +++ b/doxyfile @@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "pdfOCR 3.0.2 API" +PROJECT_NAME = "pdfOCR 4.0.0 API" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version @@ -485,7 +485,7 @@ EXTRACT_PRIV_VIRTUAL = NO # scope will be included in the documentation. # The default value is: NO. -EXTRACT_PACKAGE = NO +EXTRACT_PACKAGE = YES # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. 
diff --git a/itext.tests/itext.pdfocr.api.tests/Properties/AssemblyInfo.cs b/itext.tests/itext.pdfocr.api.tests/Properties/AssemblyInfo.cs index 9628aa2..f474793 100644 --- a/itext.tests/itext.pdfocr.api.tests/Properties/AssemblyInfo.cs +++ b/itext.tests/itext.pdfocr.api.tests/Properties/AssemblyInfo.cs @@ -15,6 +15,6 @@ [assembly: Guid("d6a6ea97-1f23-448f-b700-eff62971d234")] -[assembly: AssemblyVersion("3.0.2.0")] -[assembly: AssemblyFileVersion("3.0.2.0")] -[assembly: AssemblyInformationalVersion("3.0.2")] +[assembly: AssemblyVersion("4.0.0.0")] +[assembly: AssemblyFileVersion("4.0.0.0")] +[assembly: AssemblyInformationalVersion("4.0.0")] diff --git a/itext.tests/itext.pdfocr.api.tests/itext.pdfocr.api.tests.csproj b/itext.tests/itext.pdfocr.api.tests/itext.pdfocr.api.tests.csproj index d4f8850..cc7e738 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext.pdfocr.api.tests.csproj +++ b/itext.tests/itext.pdfocr.api.tests/itext.pdfocr.api.tests.csproj @@ -25,9 +25,9 @@ - + - + diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ApiTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ApiTest.cs index f66ea7d..f8be803 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ApiTest.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/ApiTest.cs @@ -103,9 +103,9 @@ public virtual void CreatePdfAFileWithFileTest() { String contentBytes = iText.Commons.Utils.JavaUtil.GetStringForBytes(pdf.GetPage(1).GetContentBytes(), System.Text.Encoding .UTF8); NUnit.Framework.Assert.IsTrue(contentBytes.Contains("<00190014001c001400150014>")); - PdfAConformanceLevel cl = pdf.GetReader().GetPdfAConformanceLevel(); - NUnit.Framework.Assert.AreEqual(PdfAConformanceLevel.PDF_A_3U.GetConformance(), cl.GetConformance()); - NUnit.Framework.Assert.AreEqual(PdfAConformanceLevel.PDF_A_3U.GetPart(), cl.GetPart()); + PdfAConformance cl = pdf.GetReader().GetPdfConformance().GetAConformance(); + NUnit.Framework.Assert.AreEqual(PdfAConformance.PDF_A_3U.GetLevel(), 
cl.GetLevel()); + NUnit.Framework.Assert.AreEqual(PdfAConformance.PDF_A_3U.GetPart(), cl.GetPart()); } } @@ -120,9 +120,9 @@ public virtual void CreatePdfAFileWithFileNoMetaTest() { String contentBytes = iText.Commons.Utils.JavaUtil.GetStringForBytes(pdf.GetPage(1).GetContentBytes(), System.Text.Encoding .UTF8); NUnit.Framework.Assert.IsTrue(contentBytes.Contains("<00190014001c001400150014>")); - PdfAConformanceLevel cl = pdf.GetReader().GetPdfAConformanceLevel(); - NUnit.Framework.Assert.AreEqual(PdfAConformanceLevel.PDF_A_3U.GetConformance(), cl.GetConformance()); - NUnit.Framework.Assert.AreEqual(PdfAConformanceLevel.PDF_A_3U.GetPart(), cl.GetPart()); + PdfAConformance cl = pdf.GetReader().GetPdfConformance().GetAConformance(); + NUnit.Framework.Assert.AreEqual(PdfAConformance.PDF_A_3U.GetLevel(), cl.GetLevel()); + NUnit.Framework.Assert.AreEqual(PdfAConformance.PDF_A_3U.GetPart(), cl.GetPart()); } } @@ -167,7 +167,7 @@ public virtual void TestThaiImageWithNotDefGlyphs() { [NUnit.Framework.Test] public virtual void TestImageRotationHandler() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(Exception), () => { OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); properties.SetImageRotationHandler(new ApiTest.NotImplementedImageRotationHandler()); String testName = "testSetAndGetImageRotationHandler"; @@ -176,13 +176,13 @@ public virtual void TestImageRotationHandler() { PdfHelper.CreatePdf(pdfPath, new FileInfo(path), properties); NUnit.Framework.Assert.IsNotNull(properties.GetImageRotationHandler()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo("applyRotation is not implemented")) -; + ); + NUnit.Framework.Assert.AreEqual("applyRotation is not implemented", exception.Message); } [NUnit.Framework.Test] public virtual void TestImageRotationHandlerForTiff() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(Exception), () => { 
OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); properties.SetImageRotationHandler(new ApiTest.NotImplementedImageRotationHandler()); String testName = "testSetAndGetImageRotationHandler"; @@ -191,8 +191,8 @@ public virtual void TestImageRotationHandlerForTiff() { PdfHelper.CreatePdf(pdfPath, new FileInfo(path), properties); NUnit.Framework.Assert.IsNotNull(properties.GetImageRotationHandler()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo("applyRotation is not implemented")) -; + ); + NUnit.Framework.Assert.AreEqual("applyRotation is not implemented", exception.Message); } [NUnit.Framework.Test] @@ -215,21 +215,21 @@ public virtual void TestTableStructureTree() { } [NUnit.Framework.Test] - [LogMessage(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, LogLevel = LogLevelConstants.ERROR)] public virtual void TestTaggingNotSupported() { String input = PdfHelper.GetImagesTestDirectory() + "numbers_01.jpg"; String pdfPath = PdfHelper.GetTargetDirectory() + "taggingNotSupported.pdf"; Exception e = NUnit.Framework.Assert.Catch(typeof(PdfOcrException), () => PdfHelper.CreatePdf(pdfPath, new FileInfo(input), new OcrPdfCreatorProperties().SetTagged(true))); - NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT - , PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED), e.Message); + NUnit.Framework.Assert.AreEqual(PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED, e.Message); } +//\cond DO_NOT_DOCUMENT internal class NotImplementedImageRotationHandler : IImageRotationHandler { public virtual ImageData ApplyRotation(ImageData imageData) { throw new Exception("applyRotation is not implemented"); } } +//\endcond private class DummyMetaInfo : IMetaInfo { } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfA3uTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfA3uTest.cs index a736190..ca462eb 100644 --- 
a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfA3uTest.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfA3uTest.cs @@ -54,7 +54,7 @@ public virtual void TestPdfA3uWithNullIntent() { [NUnit.Framework.Test] public virtual void TestIncompatibleOutputIntentAndFontColorSpaceException() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfException), () => { String testName = "testIncompatibleOutputIntentAndFontColorSpaceException"; String path = PdfHelper.GetDefaultImagePath(); String pdfPath = PdfHelper.GetTargetDirectory() + testName + ".pdf"; @@ -64,8 +64,9 @@ public virtual void TestIncompatibleOutputIntentAndFontColorSpaceException() { PdfHelper.CreatePdfA(pdfPath, new FileInfo(path), ocrPdfCreatorProperties, PdfHelper.GetRGBPdfOutputIntent ()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfAConformanceException.DEVICECMYK_MAY_BE_USED_ONLY_IF_THE_FILE_HAS_A_CMYK_PDFA_OUTPUT_INTENT_OR_DEFAULTCMYK_IN_USAGE_CONTEXT)) -; + ); + NUnit.Framework.Assert.AreEqual(PdfaExceptionMessageConstant.DEVICECMYK_MAY_BE_USED_ONLY_IF_THE_FILE_HAS_A_CMYK_PDFA_OUTPUT_INTENT_OR_DEFAULTCMYK_IN_USAGE_CONTEXT + , exception.Message); } [NUnit.Framework.Test] @@ -81,7 +82,7 @@ public virtual void TestPdfA3DefaultMetadata() { PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath)); NUnit.Framework.Assert.AreEqual("en-US", pdfDocument.GetCatalog().GetLang().ToString()); NUnit.Framework.Assert.AreEqual(null, pdfDocument.GetDocumentInfo().GetTitle()); - NUnit.Framework.Assert.AreEqual(PdfAConformanceLevel.PDF_A_3U, pdfDocument.GetReader().GetPdfAConformanceLevel + NUnit.Framework.Assert.AreEqual(PdfAConformance.PDF_A_3U, pdfDocument.GetReader().GetPdfConformance().GetAConformance ()); pdfDocument.Close(); } @@ -102,7 +103,7 @@ public virtual void TestPdfCustomMetadata() { PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath)); NUnit.Framework.Assert.AreEqual(locale, 
pdfDocument.GetCatalog().GetLang().ToString()); NUnit.Framework.Assert.AreEqual(title, pdfDocument.GetDocumentInfo().GetTitle()); - NUnit.Framework.Assert.AreEqual(PdfAConformanceLevel.PDF_A_3U, pdfDocument.GetReader().GetPdfAConformanceLevel + NUnit.Framework.Assert.AreEqual(PdfAConformance.PDF_A_3U, pdfDocument.GetReader().GetPdfConformance().GetAConformance ()); pdfDocument.Close(); } @@ -110,7 +111,7 @@ public virtual void TestPdfCustomMetadata() { [LogMessage(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, Count = 1)] [NUnit.Framework.Test] public virtual void TestNonCompliantThaiPdfA() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrException), () => { String testName = "testNonCompliantThaiPdfA"; String path = PdfHelper.GetThaiImagePath(); String pdfPath = PdfHelper.GetTargetDirectory() + testName + ".pdf"; @@ -120,8 +121,10 @@ public virtual void TestNonCompliantThaiPdfA() { PdfHelper.CreatePdfA(pdfPath, new FileInfo(path), ocrPdfCreatorProperties, PdfHelper.GetRGBPdfOutputIntent ()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, MessageFormatUtil.Format(PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, 3611)))) -; + ); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT + , MessageFormatUtil.Format(PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER + , 3611)), exception.Message); } [NUnit.Framework.Test] @@ -153,15 +156,16 @@ public virtual void TestCompliantThaiPdfA() { [LogMessage(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, Count = 1)] [NUnit.Framework.Test] public virtual void TestPdfACreateWithoutPdfLangProperty() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrException), () => { String 
testName = "testPdfACreateWithoutPdfLangProperty"; String path = PdfHelper.GetThaiImagePath(); String pdfPath = PdfHelper.GetTargetDirectory() + testName + ".pdf"; PdfHelper.CreatePdfA(pdfPath, new FileInfo(path), new OcrPdfCreatorProperties(), PdfHelper.GetRGBPdfOutputIntent ()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, PdfOcrLogMessageConstant.PDF_LANGUAGE_PROPERTY_IS_NOT_SET))) -; + ); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT + , PdfOcrLogMessageConstant.PDF_LANGUAGE_PROPERTY_IS_NOT_SET), exception.Message); } } } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfCreatorUtilTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfCreatorUtilTest.cs index 564aae6..bdf9d12 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfCreatorUtilTest.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfCreatorUtilTest.cs @@ -20,6 +20,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
*/ +using System; using System.Collections.Generic; using System.IO; using iText.Commons.Utils; @@ -70,21 +71,17 @@ public virtual void GetImageDataFromValidNotTiffTest() { [NUnit.Framework.Test] [LogMessage(PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)] public virtual void GetImageDataFromNotExistingImageTest() { - NUnit.Framework.Assert.That(() => { - PdfCreatorUtil.GetImageData(new FileInfo("no such path"), null); - } - , NUnit.Framework.Throws.InstanceOf()) -; + NUnit.Framework.Assert.Catch(typeof(PdfOcrInputException), () => PdfCreatorUtil.GetImageData(new FileInfo( + "no such path"), null)); } [NUnit.Framework.Test] [LogMessage(PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)] public virtual void GetImageDataFromInvalidImageTest() { - NUnit.Framework.Assert.That(() => { - PdfCreatorUtil.GetImageData(new FileInfo(PdfHelper.GetImagesTestDirectory() + "corrupted.jpg"), null); - } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_READ_INPUT_IMAGE))) -; + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrInputException), () => PdfCreatorUtil.GetImageData + (new FileInfo(PdfHelper.GetImagesTestDirectory() + "corrupted.jpg"), null)); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_READ_INPUT_IMAGE + ), exception.Message); } } } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfFontTest.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfFontTest.cs index 8fe29c6..5f5ec62 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfFontTest.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/PdfFontTest.cs @@ -57,7 +57,7 @@ public virtual void TestFontColor() { [LogMessage(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, Count = 1)] [NUnit.Framework.Test] public virtual void TestInvalidFontWithInvalidDefaultFontFamily() { - NUnit.Framework.Assert.That(() => { + Exception exception = 
NUnit.Framework.Assert.Catch(typeof(PdfOcrException), () => { String testName = "testInvalidFontWithInvalidDefaultFontFamily"; String path = PdfHelper.GetDefaultImagePath(); String pdfPath = PdfHelper.GetTargetDirectory() + testName + ".pdf"; @@ -72,8 +72,9 @@ public virtual void TestInvalidFontWithInvalidDefaultFontFamily() { NUnit.Framework.Assert.AreEqual(PdfHelper.DEFAULT_TEXT, result); NUnit.Framework.Assert.AreEqual(ScaleMode.SCALE_TO_FIT, properties.GetScaleMode()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT, PdfOcrExceptionMessageConstant.CANNOT_RESOLVE_PROVIDED_FONTS))) -; + ); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrExceptionMessageConstant.CANNOT_CREATE_PDF_DOCUMENT + , PdfOcrExceptionMessageConstant.CANNOT_RESOLVE_PROVIDED_FONTS), exception.Message); } [NUnit.Framework.Test] diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomOcrEngine.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomOcrEngine.cs index 957efe6..00ad5f1 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomOcrEngine.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomOcrEngine.cs @@ -61,6 +61,10 @@ public virtual void CreateTxtFile(IList inputImages, FileInfo txtFile, ) { } + public virtual bool IsTaggingSupported() { + return false; + } + public virtual OcrEngineProperties GetOcrEngineProperties() { return ocrEngineProperties; } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomProductAwareOcrEngine.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomProductAwareOcrEngine.cs index 0dcca43..20ec990 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomProductAwareOcrEngine.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/CustomProductAwareOcrEngine.cs @@ -50,6 +50,10 @@ public virtual void 
CreateTxtFile(IList inputImages, FileInfo txtFile, ) { } + public virtual bool IsTaggingSupported() { + return true; + } + public virtual OcrEngineProperties GetOcrEngineProperties() { return null; } diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/PdfHelper.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/PdfHelper.cs index ebba8f9..ea26cb9 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/PdfHelper.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/PdfHelper.cs @@ -84,12 +84,13 @@ public static String GetTargetDirectory() { /// Create pdfWriter using provided path to destination file. public static PdfWriter GetPdfWriter(String pdfPath) { - return new PdfWriter(pdfPath, new WriterProperties().AddUAXmpMetadata()); + return new PdfWriter(pdfPath, new WriterProperties().AddPdfUaXmpMetadata(PdfUAConformance.PDF_UA_1)); } /// Create pdfWriter. public static PdfWriter GetPdfWriter() { - return new PdfWriter(new MemoryStream(), new WriterProperties().AddUAXmpMetadata()); + return new PdfWriter(new MemoryStream(), new WriterProperties().AddPdfUaXmpMetadata(PdfUAConformance.PDF_UA_1 + )); } /// Creates PDF rgb output intent for tests. 
diff --git a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/TestStructureDetectionOcrEngine.cs b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/TestStructureDetectionOcrEngine.cs index 7970f00..c98ba98 100644 --- a/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/TestStructureDetectionOcrEngine.cs +++ b/itext.tests/itext.pdfocr.api.tests/itext/pdfocr/helpers/TestStructureDetectionOcrEngine.cs @@ -85,5 +85,9 @@ public virtual void CreateTxtFile(IList inputImages, FileInfo txtFile) public virtual void CreateTxtFile(IList inputImages, FileInfo txtFile, OcrProcessContext ocrProcessContext ) { } + + public virtual bool IsTaggingSupported() { + return true; + } } } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/Properties/AssemblyInfo.cs b/itext.tests/itext.pdfocr.tesseract4.tests/Properties/AssemblyInfo.cs index 3e21341..516b10a 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/Properties/AssemblyInfo.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/Properties/AssemblyInfo.cs @@ -15,6 +15,6 @@ [assembly: Guid("d6a6ea97-1f23-448f-b700-eff62971d234")] -[assembly: AssemblyVersion("3.0.2.0")] -[assembly: AssemblyFileVersion("3.0.2.0")] -[assembly: AssemblyInformationalVersion("3.0.2")] +[assembly: AssemblyVersion("4.0.0.0")] +[assembly: AssemblyFileVersion("4.0.0.0")] +[assembly: AssemblyInformationalVersion("4.0.0")] diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj b/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj index 831a886..740d748 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj @@ -26,9 +26,9 @@ - + - + diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/IntegrationTestHelper.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/IntegrationTestHelper.cs index 59d313b..20d17d8 100644 --- 
a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/IntegrationTestHelper.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/IntegrationTestHelper.cs @@ -398,7 +398,7 @@ protected internal virtual String GetTextFromTextFile(FileInfo file) { /// Create pdfWriter using provided path to destination file. protected internal virtual PdfWriter GetPdfWriter(String pdfPath) { - return new PdfWriter(pdfPath, new WriterProperties().AddUAXmpMetadata()); + return new PdfWriter(pdfPath, new WriterProperties().AddPdfUaXmpMetadata(PdfUAConformance.PDF_UA_1)); } /// Gets image name from path. diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/TesseractExecutableIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/TesseractExecutableIntegrationTest.cs index 16f0783..fc1be83 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/TesseractExecutableIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/TesseractExecutableIntegrationTest.cs @@ -20,6 +20,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
*/ +using System; using System.IO; using iText.Pdfocr.Tesseract4; using iText.Pdfocr.Tesseract4.Exceptions; @@ -32,39 +33,38 @@ public class TesseractExecutableIntegrationTest : IntegrationTestHelper { [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE, Count = 1)] [NUnit.Framework.Test] public virtual void TestNullPathToTesseractExecutable() { - NUnit.Framework.Assert.That(() => { - FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { Tesseract4ExecutableOcrEngine tesseractExecutableReader = new Tesseract4ExecutableOcrEngine(new Tesseract4OcrEngineProperties ()); tesseractExecutableReader.SetPathToExecutable(null); GetTextFromPdf(tesseractExecutableReader, file); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE)) -; + ); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE + , exception.Message); } [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE, Count = 1)] [NUnit.Framework.Test] public virtual void TestEmptyPathToTesseractExecutable() { - NUnit.Framework.Assert.That(() => { - FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); - GetTextFromPdf(new Tesseract4ExecutableOcrEngine("", new Tesseract4OcrEngineProperties()), file); - } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE)) -; + FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => GetTextFromPdf + (new Tesseract4ExecutableOcrEngine("", new 
Tesseract4OcrEngineProperties()), file)); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE + , exception.Message); } [LogMessage(Tesseract4LogMessageConstant.COMMAND_FAILED, Count = 1)] [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_NOT_FOUND, Count = 1)] [NUnit.Framework.Test] public virtual void TestIncorrectPathToTesseractExecutable() { - NUnit.Framework.Assert.That(() => { - FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); - GetTextFromPdf(new Tesseract4ExecutableOcrEngine("path\\to\\executable\\", new Tesseract4OcrEngineProperties - ()), file); - } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_NOT_FOUND)) -; + FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => GetTextFromPdf + (new Tesseract4ExecutableOcrEngine("path\\to\\executable\\", new Tesseract4OcrEngineProperties()), file + )); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_NOT_FOUND, exception.Message + ); } } } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs index eb96c00..8753fa8 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs @@ -38,7 +38,9 @@ You should have received a copy of the GNU Affero General Public License namespace iText.Pdfocr.General { public abstract class BasicTesseractIntegrationTest : IntegrationTestHelper { +//\cond DO_NOT_DOCUMENT internal AbstractTesseract4OcrEngine tesseractReader; +//\endcond public 
BasicTesseractIntegrationTest(IntegrationTestHelper.ReaderType type) { tesseractReader = GetTesseractReader(type); @@ -122,7 +124,7 @@ public virtual void TestImageWithoutText() { [LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, Count = 1)] [NUnit.Framework.Test] public virtual void TestInputInvalidImage() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { FileInfo file1 = new FileInfo(TEST_IMAGES_DIRECTORY + "example.txt"); FileInfo file2 = new FileInfo(TEST_IMAGES_DIRECTORY + "example_05_corrupted.bmp"); FileInfo file3 = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_02.jpg"); @@ -131,8 +133,9 @@ public virtual void TestInputInvalidImage() { OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); ocrPdfCreator.CreatePdf(JavaUtil.ArraysAsList(file3, file1, file2, file3), GetPdfWriter()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_READ_PROVIDED_IMAGE, new FileInfo(TEST_IMAGES_DIRECTORY + "example.txt").FullName))) -; + ); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_READ_PROVIDED_IMAGE + , new FileInfo(TEST_IMAGES_DIRECTORY + "example.txt").FullName), exception.Message); } [NUnit.Framework.Test] @@ -155,32 +158,34 @@ public virtual void TestNonAsciiImageName() { [NUnit.Framework.Test] public virtual void TestNullPathToTessData() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPathToTessData (null)); GetTextFromPdf(tesseractReader, file, JavaCollectionsUtil.SingletonList("eng")); } - , 
NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID)) -; + ); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID + , exception.Message); } [NUnit.Framework.Test] public virtual void TestPathToTessDataWithoutData() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPathToTessData (new FileInfo("test/"))); GetTextFromPdf(tesseractReader, file, JavaCollectionsUtil.SingletonList("eng")); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID)) -; + ); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID + , exception.Message); } [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE)] [NUnit.Framework.Test] public virtual void TestEmptyPathToTessData() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); Tesseract4OcrEngineProperties properties = tesseractReader.GetTesseract4OcrEngineProperties().SetPathToTessData (new FileInfo(".")); @@ -189,56 +194,57 @@ public virtual void TestEmptyPathToTessData() { NUnit.Framework.Assert.AreEqual(new FileInfo("").FullName, tesseractReader.GetTesseract4OcrEngineProperties ().GetPathToTessData().FullName); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, "eng.traineddata", new FileInfo(".").FullName))) -; + ); + 
NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE + , "eng.traineddata", new FileInfo(".").FullName), exception.Message); } [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, Count = 1)] [NUnit.Framework.Test] public virtual void TestIncorrectLanguage() { - NUnit.Framework.Assert.That(() => { - FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); - GetTextFromPdf(tesseractReader, file, JavaCollectionsUtil.SingletonList("spa_new")); - } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, "spa_new.traineddata", new FileInfo(LANG_TESS_DATA_DIRECTORY).FullName))) -; + FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => GetTextFromPdf + (tesseractReader, file, JavaCollectionsUtil.SingletonList("spa_new"))); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE + , "spa_new.traineddata", new FileInfo(LANG_TESS_DATA_DIRECTORY).FullName), exception.Message); } [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, Count = 1)] [NUnit.Framework.Test] public virtual void TestListOfLanguagesWithOneIncorrectLanguage() { - NUnit.Framework.Assert.That(() => { - FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); - GetTextFromPdf(tesseractReader, file, JavaUtil.ArraysAsList("spa", "spa_new", "spa_old")); - } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, "spa_new.traineddata", new FileInfo(LANG_TESS_DATA_DIRECTORY).FullName))) -; + FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + Exception exception = 
NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => GetTextFromPdf + (tesseractReader, file, JavaUtil.ArraysAsList("spa", "spa_new", "spa_old"))); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE + , "spa_new.traineddata", new FileInfo(LANG_TESS_DATA_DIRECTORY).FullName), exception.Message); } [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, Count = 1)] [NUnit.Framework.Test] public virtual void TestIncorrectScriptsName() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPathToTessData (new FileInfo(SCRIPT_TESS_DATA_DIRECTORY))); GetTextFromPdf(tesseractReader, file, JavaCollectionsUtil.SingletonList("English")); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, "English.traineddata", new FileInfo(SCRIPT_TESS_DATA_DIRECTORY).FullName))) -; + ); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE + , "English.traineddata", new FileInfo(SCRIPT_TESS_DATA_DIRECTORY).FullName), exception.Message); } [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, Count = 1)] [NUnit.Framework.Test] public virtual void TestListOfScriptsWithOneIncorrect() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPathToTessData (new FileInfo(SCRIPT_TESS_DATA_DIRECTORY))); 
GetTextFromPdf(tesseractReader, file, JavaUtil.ArraysAsList("Georgian", "Japanese", "English")); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE, "English.traineddata", new FileInfo(SCRIPT_TESS_DATA_DIRECTORY).FullName))) -; + ); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE + , "English.traineddata", new FileInfo(SCRIPT_TESS_DATA_DIRECTORY).FullName), exception.Message); } [NUnit.Framework.Test] @@ -321,7 +327,8 @@ private String GetTextFromPage(IList pageText) { /// Create pdfWriter. private PdfWriter GetPdfWriter() { - return new PdfWriter(new ByteArrayOutputStream(), new WriterProperties().AddUAXmpMetadata()); + return new PdfWriter(new ByteArrayOutputStream(), new WriterProperties().AddPdfUaXmpMetadata(PdfUAConformance + .PDF_UA_1)); } } } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/imageformats/ImageFormatIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/imageformats/ImageFormatIntegrationTest.cs index d618cd9..d786ac0 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/imageformats/ImageFormatIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/imageformats/ImageFormatIntegrationTest.cs @@ -33,9 +33,13 @@ You should have received a copy of the GNU Affero General Public License namespace iText.Pdfocr.Imageformats { public abstract class ImageFormatIntegrationTest : IntegrationTestHelper { +//\cond DO_NOT_DOCUMENT internal AbstractTesseract4OcrEngine tesseractReader; +//\endcond +//\cond DO_NOT_DOCUMENT internal String testType; +//\endcond public ImageFormatIntegrationTest(IntegrationTestHelper.ReaderType type) { tesseractReader = GetTesseractReader(type); @@ -224,12 +228,11 @@ public virtual void TestInputMultipagesTIFFWithoutPreprocessing() { 
[LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, Count = 1)] [NUnit.Framework.Test] public virtual void TestInputWrongFormat() { - NUnit.Framework.Assert.That(() => { - FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "wierdwords.gif"); - GetTextFromPdf(tesseractReader, file); - } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_INPUT_IMAGE_FORMAT, "wierdwords.gif"))) -; + FileInfo file = new FileInfo(TEST_IMAGES_DIRECTORY + "wierdwords.gif"); + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => GetTextFromPdf + (tesseractReader, file)); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_INPUT_IMAGE_FORMAT + , "wierdwords.gif"), exception.Message); } [NUnit.Framework.Test] diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdfa3u/PdfA3UIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdfa3u/PdfA3UIntegrationTest.cs index e204f6f..729c429 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdfa3u/PdfA3UIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdfa3u/PdfA3UIntegrationTest.cs @@ -37,7 +37,9 @@ public abstract class PdfA3UIntegrationTest : IntegrationTestHelper { // path to default rgb color profile private static readonly String DEFAULT_RGB_COLOR_PROFILE_PATH = TEST_DIRECTORY + "profiles/sRGB_CS_profile.icm"; +//\cond DO_NOT_DOCUMENT internal AbstractTesseract4OcrEngine tesseractReader; +//\endcond public PdfA3UIntegrationTest(IntegrationTestHelper.ReaderType type) { tesseractReader = GetTesseractReader(type); diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdflayers/PdfLayersIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdflayers/PdfLayersIntegrationTest.cs index 96ac6ad..b913ece 100644 --- 
a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdflayers/PdfLayersIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/pdflayers/PdfLayersIntegrationTest.cs @@ -31,7 +31,9 @@ You should have received a copy of the GNU Affero General Public License namespace iText.Pdfocr.Pdflayers { public abstract class PdfLayersIntegrationTest : IntegrationTestHelper { +//\cond DO_NOT_DOCUMENT internal AbstractTesseract4OcrEngine tesseractReader; +//\endcond public PdfLayersIntegrationTest(IntegrationTestHelper.ReaderType type) { tesseractReader = GetTesseractReader(type); diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationLibTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationLibTest.cs index 3c58c5a..9579a4f 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationLibTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationLibTest.cs @@ -41,14 +41,11 @@ public TessDataIntegrationLibTest() )] [NUnit.Framework.Test] public virtual void TestTessDataWithNonAsciiPath() { - NUnit.Framework.Assert.That(() => { - // Throws exception for the tesseract lib test - DoOcrAndGetTextUsingTessDataByNonAsciiPath(); - NUnit.Framework.Assert.Fail("Should throw exception for the tesseract lib when tess data path contains non ASCII characters" - ); - } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_CONTAINS_NON_ASCII_CHARACTERS)) -; + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => + // Throws exception for the tesseract lib test + DoOcrAndGetTextUsingTessDataByNonAsciiPath()); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_DIRECTORY_CONTAINS_NON_ASCII_CHARACTERS + , exception.Message); } #if !NETSTANDARD2_0 diff --git 
a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationTest.cs index b40c7b0..36a7597 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tessdata/TessDataIntegrationTest.cs @@ -39,9 +39,13 @@ public abstract class TessDataIntegrationTest : IntegrationTestHelper { private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(iText.Pdfocr.Tessdata.TessDataIntegrationTest )); +//\cond DO_NOT_DOCUMENT internal AbstractTesseract4OcrEngine tesseractReader; +//\endcond +//\cond DO_NOT_DOCUMENT internal String testFileTypeName; +//\endcond private bool isExecutableReaderType; diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ApiTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ApiTest.cs index b1f5638..71313f9 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ApiTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ApiTest.cs @@ -25,6 +25,7 @@ You should have received a copy of the GNU Affero General Public License using System.IO; using iText.Commons.Utils; using iText.Pdfocr; +using iText.Pdfocr.Exceptions; using iText.Pdfocr.Tesseract4.Exceptions; using iText.Pdfocr.Tesseract4.Logs; using iText.Test.Attributes; @@ -34,42 +35,45 @@ public class ApiTest : IntegrationTestHelper { [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_IS_NOT_SET)] [NUnit.Framework.Test] public virtual void TestDefaultTessDataPathValidationForLib() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; FileInfo imgFile = new FileInfo(path); Tesseract4LibOcrEngine engine = new 
Tesseract4LibOcrEngine(new Tesseract4OcrEngineProperties()); engine.DoImageOcr(imgFile); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_IS_NOT_SET)) -; + ); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_IS_NOT_SET, exception + .Message); } [LogMessage(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_IS_NOT_SET)] [NUnit.Framework.Test] public virtual void TestDefaultTessDataPathValidationForExecutable() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; FileInfo imgFile = new FileInfo(path); Tesseract4ExecutableOcrEngine engine = new Tesseract4ExecutableOcrEngine(new Tesseract4OcrEngineProperties ()); engine.DoImageOcr(imgFile); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_IS_NOT_SET)) -; + ); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ExceptionMessageConstant.PATH_TO_TESS_DATA_IS_NOT_SET, exception + .Message); } [LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, Count = 2)] [NUnit.Framework.Test] public virtual void TestDoTesseractOcrForIncorrectImageForExecutable() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { String path = TEST_IMAGES_DIRECTORY + "numbers_01"; FileInfo imgFile = new FileInfo(path); Tesseract4ExecutableOcrEngine engine = new Tesseract4ExecutableOcrEngine(new Tesseract4OcrEngineProperties ().SetPathToTessData(GetTessDataDirectory())); engine.DoTesseractOcr(imgFile, null, OutputFormat.HOCR); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_READ_PROVIDED_IMAGE, new FileInfo(TEST_IMAGES_DIRECTORY + 
"numbers_01").FullName))) -; + ); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_READ_PROVIDED_IMAGE + , new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01").FullName), exception.Message); } [LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)] @@ -77,15 +81,16 @@ public virtual void TestDoTesseractOcrForIncorrectImageForExecutable() { [LogMessage(Tesseract4LogMessageConstant.TESSERACT_FAILED)] [NUnit.Framework.Test] public virtual void TestOcrResultForSinglePageForNullImage() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { Tesseract4LibOcrEngine tesseract4LibOcrEngine = GetTesseract4LibOcrEngine(); tesseract4LibOcrEngine.SetTesseract4OcrEngineProperties(new Tesseract4OcrEngineProperties().SetPathToTessData (GetTessDataDirectory())); tesseract4LibOcrEngine.InitializeTesseract(OutputFormat.TXT); tesseract4LibOcrEngine.DoTesseractOcr(null, null, OutputFormat.HOCR); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED)) -; + ); + NUnit.Framework.Assert.AreEqual(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED, exception.Message + ); } [NUnit.Framework.Test] @@ -116,5 +121,19 @@ public virtual void TestDetectAndFixBrokenBBoxes() { NUnit.Framework.Assert.AreEqual(385.5, (float)textInfo.GetBboxRect().GetRight(), 0.1); NUnit.Framework.Assert.AreEqual(162.75, (float)textInfo.GetBboxRect().GetTop(), 0.1); } + + [NUnit.Framework.Test] + public virtual void TestTaggingNotSupportedForTesseract4ExecutableOcrEngine() { + Exception e = NUnit.Framework.Assert.Catch(typeof(PdfOcrException), () => new OcrPdfCreator(new Tesseract4ExecutableOcrEngine + (new Tesseract4OcrEngineProperties()), new OcrPdfCreatorProperties().SetTagged(true))); + NUnit.Framework.Assert.AreEqual(PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED, e.Message); + } + + 
[NUnit.Framework.Test] + public virtual void TestTaggingNotSupportedForTesseract4LibOcrEngine() { + Exception e = NUnit.Framework.Assert.Catch(typeof(PdfOcrException), () => new OcrPdfCreator(new Tesseract4LibOcrEngine + (new Tesseract4OcrEngineProperties()), new OcrPdfCreatorProperties().SetTagged(true))); + NUnit.Framework.Assert.AreEqual(PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED, e.Message); + } } } diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImageIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImageIntegrationTest.cs index 249bc6f..705df95 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImageIntegrationTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImageIntegrationTest.cs @@ -34,9 +34,13 @@ public abstract class ImageIntegrationTest : IntegrationTestHelper { private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImageIntegrationTest )); +//\cond DO_NOT_DOCUMENT internal AbstractTesseract4OcrEngine tesseractReader; +//\endcond +//\cond DO_NOT_DOCUMENT internal String testFileTypeName; +//\endcond private bool isExecutableReaderType; diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImagePreprocessingUtilTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImagePreprocessingUtilTest.cs index 03efcff..a74449e 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImagePreprocessingUtilTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/ImagePreprocessingUtilTest.cs @@ -41,13 +41,10 @@ public virtual void TestCheckForInvalidTiff() { [LogMessage(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)] [NUnit.Framework.Test] public virtual void TestReadingInvalidImagePath() { - NUnit.Framework.Assert.That(() => { - String path = TEST_IMAGES_DIRECTORY + "numbers_02"; - 
FileInfo imgFile = new FileInfo(path); - ImagePreprocessingUtil.PreprocessImage(imgFile, 1, new ImagePreprocessingOptions()); - } - , NUnit.Framework.Throws.InstanceOf()) -; + String path = TEST_IMAGES_DIRECTORY + "numbers_02"; + FileInfo imgFile = new FileInfo(path); + NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => ImagePreprocessingUtil.PreprocessImage + (imgFile, 1, new ImagePreprocessingOptions())); } [NUnit.Framework.Test] diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperTest.cs index 8aa6101..27655d6 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/TesseractHelperTest.cs @@ -34,9 +34,13 @@ public abstract class TesseractHelperTest : IntegrationTestHelper { private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.TesseractHelperTest )); +//\cond DO_NOT_DOCUMENT internal AbstractTesseract4OcrEngine tesseractReader; +//\endcond +//\cond DO_NOT_DOCUMENT internal String testFileTypeName; +//\endcond private bool isExecutableReaderType; diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/UserWordsTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/UserWordsTest.cs index b42b87a..0648027 100644 --- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/UserWordsTest.cs +++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/tesseract4/UserWordsTest.cs @@ -29,9 +29,13 @@ You should have received a copy of the GNU Affero General Public License namespace iText.Pdfocr.Tesseract4 { public abstract class UserWordsTest : IntegrationTestHelper { +//\cond DO_NOT_DOCUMENT internal AbstractTesseract4OcrEngine tesseractReader; +//\endcond +//\cond DO_NOT_DOCUMENT internal String 
testFileTypeName; +//\endcond private bool isExecutableReaderType; @@ -85,25 +89,27 @@ public virtual void TestCustomUserWordsWithListOfLanguages() { [NUnit.Framework.Test] public virtual void TestUserWordsWithLanguageNotInList() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { String userWords = TEST_DOCUMENTS_DIRECTORY + "userwords.txt"; Tesseract4OcrEngineProperties properties = tesseractReader.GetTesseract4OcrEngineProperties(); properties.SetUserWords("spa", new FileStream(userWords, FileMode.Open, FileAccess.Read)); properties.SetLanguages(new List()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.LANGUAGE_IS_NOT_IN_THE_LIST, "spa"))) -; + ); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.LANGUAGE_IS_NOT_IN_THE_LIST + , "spa"), exception.Message); } [NUnit.Framework.Test] public virtual void TestIncorrectLanguageForUserWordsAsList() { - NUnit.Framework.Assert.That(() => { + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfOcrTesseract4Exception), () => { Tesseract4OcrEngineProperties properties = tesseractReader.GetTesseract4OcrEngineProperties(); properties.SetUserWords("eng1", JavaUtil.ArraysAsList("word1", "word2")); properties.SetLanguages(new List()); } - , NUnit.Framework.Throws.InstanceOf().With.Message.EqualTo(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.LANGUAGE_IS_NOT_IN_THE_LIST, "eng1"))) -; + ); + NUnit.Framework.Assert.AreEqual(MessageFormatUtil.Format(PdfOcrTesseract4ExceptionMessageConstant.LANGUAGE_IS_NOT_IN_THE_LIST + , "eng1"), exception.Message); } [NUnit.Framework.Test] diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/resources/itext/pdfocr/documents/invoice_front_thai_lib_dotnet.pdf 
b/itext.tests/itext.pdfocr.tesseract4.tests/resources/itext/pdfocr/documents/invoice_front_thai_lib_dotnet.pdf index 1f06512..fcd4b05 100644 Binary files a/itext.tests/itext.pdfocr.tesseract4.tests/resources/itext/pdfocr/documents/invoice_front_thai_lib_dotnet.pdf and b/itext.tests/itext.pdfocr.tesseract4.tests/resources/itext/pdfocr/documents/invoice_front_thai_lib_dotnet.pdf differ diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/resources/itext/pdfocr/documents/invoice_front_thai_lib_java.pdf b/itext.tests/itext.pdfocr.tesseract4.tests/resources/itext/pdfocr/documents/invoice_front_thai_lib_java.pdf index eb6415a..d0d23a2 100644 Binary files a/itext.tests/itext.pdfocr.tesseract4.tests/resources/itext/pdfocr/documents/invoice_front_thai_lib_java.pdf and b/itext.tests/itext.pdfocr.tesseract4.tests/resources/itext/pdfocr/documents/invoice_front_thai_lib_java.pdf differ diff --git a/itext/itext.pdfocr.api/PdfOcrExtensions.cs b/itext/itext.pdfocr.api/PdfOcrExtensions.cs index dad4f06..5631ee2 100644 --- a/itext/itext.pdfocr.api/PdfOcrExtensions.cs +++ b/itext/itext.pdfocr.api/PdfOcrExtensions.cs @@ -29,7 +29,7 @@ You should have received a copy of the GNU Affero General Public License using System.Reflection; using iText.Pdfocr; - +//\cond DO_NOT_DOCUMENT internal static class PdfOcrExtensions { public static TValue Get(this IDictionary col, TKey key) @@ -55,3 +55,4 @@ public static bool IsEmpty(this ICollection> collec } } +//\endcond \ No newline at end of file diff --git a/itext/itext.pdfocr.api/Properties/AssemblyInfo.cs b/itext/itext.pdfocr.api/Properties/AssemblyInfo.cs index 2cd884e..619ffad 100644 --- a/itext/itext.pdfocr.api/Properties/AssemblyInfo.cs +++ b/itext/itext.pdfocr.api/Properties/AssemblyInfo.cs @@ -14,9 +14,9 @@ [assembly: Guid("0c4ceb00-9a56-4547-a925-5974a85a6048")] -[assembly: AssemblyVersion("3.0.2.0")] -[assembly: AssemblyFileVersion("3.0.2.0")] -[assembly: AssemblyInformationalVersion("3.0.2")] +[assembly: AssemblyVersion("4.0.0.0")] 
+[assembly: AssemblyFileVersion("4.0.0.0")] +[assembly: AssemblyInformationalVersion("4.0.0")] [assembly: InternalsVisibleTo("itext.pdfocr.api.tests, PublicKey=" + "00240000048000009400000006020000002400005253413100040000010001008b21ed5b3fc1c1" + "1996390981fe22bbe71a39a9e11d3c2cefddd6ee92920fa871f9666ae0fa941af0280d0653df04" + diff --git a/itext/itext.pdfocr.api/itext.pdfocr.api.csproj b/itext/itext.pdfocr.api/itext.pdfocr.api.csproj index 6e1943e..ff4c0f9 100644 --- a/itext/itext.pdfocr.api/itext.pdfocr.api.csproj +++ b/itext/itext.pdfocr.api/itext.pdfocr.api.csproj @@ -30,7 +30,7 @@ - + diff --git a/itext/itext.pdfocr.api/itext/pdfocr/IOcrEngine.cs b/itext/itext.pdfocr.api/itext/pdfocr/IOcrEngine.cs index dc5375f..d909560 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/IOcrEngine.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/IOcrEngine.cs @@ -132,5 +132,15 @@ public interface IOcrEngine { /// file to be created /// ocr processing context void CreateTxtFile(IList inputImages, FileInfo txtFile, OcrProcessContext ocrProcessContext); + + /// Checks whether tagging is supported by the OCR engine. 
+ /// + /// + /// + /// if tagging is supported by the engine, + /// + /// otherwise + /// + bool IsTaggingSupported(); } } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreator.cs b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreator.cs index 2d2dbee..9d28d90 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreator.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreator.cs @@ -113,6 +113,9 @@ public OcrPdfCreator(IOcrEngine ocrEngine) /// /// public OcrPdfCreator(IOcrEngine ocrEngine, OcrPdfCreatorProperties ocrPdfCreatorProperties) { + if (ocrPdfCreatorProperties.IsTagged() && !ocrEngine.IsTaggingSupported()) { + throw new PdfOcrException(PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED); + } SetOcrEngine(ocrEngine); SetOcrPdfCreatorProperties(ocrPdfCreatorProperties); } @@ -630,12 +633,8 @@ private void AddToCanvas(PdfDocument pdfDocument, Rectangle imageSize, IList logicalTree = new List(); // A map of leaf LogicalStructureTreeItem's to TextInfo's attached to these leaves - IDictionary> leavesTextInfos = new Dictionary>(); - bool taggedSupported = GetLogicalTree(pageText, logicalTree, leavesTextInfos); - if (!taggedSupported) { - throw new PdfOcrException(PdfOcrExceptionMessageConstant.TAGGING_IS_NOT_SUPPORTED); - } + IDictionary> leavesTextInfos = GetLogicalTree(pageText, logicalTree + ); pdfDocument.SetTagged(); // Create a map of TextInfo to tag pointers meanwhile creating the required tags. 
// Tag pointers are later used to put all the required info into canvas (content stream) @@ -661,8 +660,7 @@ private PdfDocument CreatePdfDocument(PdfWriter pdfWriter, PdfOutputIntent pdfOu PdfDocument pdfDocument; bool createPdfA3u = pdfOutputIntent != null; if (createPdfA3u) { - pdfDocument = new PdfADocument(pdfWriter, PdfAConformanceLevel.PDF_A_3U, pdfOutputIntent, documentProperties - ); + pdfDocument = new PdfADocument(pdfWriter, PdfAConformance.PDF_A_3U, pdfOutputIntent, documentProperties); } else { pdfDocument = new PdfDocument(pdfWriter, documentProperties); @@ -755,8 +753,8 @@ private void AddImageToCanvas(ImageData imageData, Rectangle imageSize, PdfCanva else { Point coordinates = PdfCreatorUtil.CalculateImageCoordinates(ocrPdfCreatorProperties.GetPageSize(), imageSize ); - Rectangle rect = new Rectangle((float)coordinates.x, (float)coordinates.y, imageSize.GetWidth(), imageSize - .GetHeight()); + Rectangle rect = new Rectangle((float)coordinates.GetX(), (float)coordinates.GetY(), imageSize.GetWidth(), + imageSize.GetHeight()); pdfCanvas.AddImageFittedIntoRectangle(imageData, rect, false); } if (ocrPdfCreatorProperties.IsTagged()) { @@ -765,18 +763,12 @@ private void AddImageToCanvas(ImageData imageData, Rectangle imageSize, PdfCanva } } - /// - /// - /// - /// if tagging supported by the engine. - /// - [System.ObsoleteAttribute(@"In next major version we need to add boolean taggingSupported() method into IOcrEngine and throw exception in OcrPdfCreator constructor if taggingSupported() returns false but OcrPdfCreatorProperties.getTagged returns true." 
- )] - private static bool GetLogicalTree(IList textInfos, IList logicalStructureTreeItems - , IDictionary> leavesTextInfos) { - bool taggedSupported = false; + private static IDictionary> GetLogicalTree(IList textInfos + , IList logicalStructureTreeItems) { + IDictionary> leavesTextInfos = new Dictionary>(); if (textInfos == null) { - return taggedSupported; + return leavesTextInfos; } foreach (TextInfo textInfo in textInfos) { LogicalStructureTreeItem structTreeItem = textInfo.GetLogicalStructureTreeItem(); @@ -787,7 +779,6 @@ private static bool GetLogicalTree(IList textInfos, IList textInfos, IList pageText, IDic else { paragraph.SetTextRenderingMode(PdfCanvasConstants.TextRenderingMode.INVISIBLE); } - canvas.ShowTextAligned(paragraph, xOffset + (float)imageCoordinates.x, yOffset + (float)imageCoordinates.y - , TextAlignment.LEFT); + canvas.ShowTextAligned(paragraph, xOffset + (float)imageCoordinates.GetX(), yOffset + (float)imageCoordinates + .GetY(), TextAlignment.LEFT); if (ocrPdfCreatorProperties.IsTagged()) { pdfCanvas.CloseTag(); } @@ -1023,7 +1014,7 @@ public override PdfCanvas ShowText(GlyphLine text) { // default value for error message, it'll be updated with the // unicode of the not found glyph String message = PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER; - for (int i = glyphLine.start; i < glyphLine.end; i++) { + for (int i = glyphLine.GetStart(); i < glyphLine.GetEnd(); i++) { if (IsNotDefGlyph(currentFont, glyphLine.Get(i))) { notDefGlyphsExists = true; message = MessageFormatUtil.Format(PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER diff --git a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorEventHelper.cs b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorEventHelper.cs index f5dec9b..f590a6b 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorEventHelper.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorEventHelper.cs @@ -27,15 +27,18 @@ You should have 
received a copy of the GNU Affero General Public License using iText.Pdfocr.Statistics; namespace iText.Pdfocr { +//\cond DO_NOT_DOCUMENT internal class OcrPdfCreatorEventHelper : AbstractPdfOcrEventHelper { private readonly SequenceId sequenceId; private readonly IMetaInfo metaInfo; +//\cond DO_NOT_DOCUMENT internal OcrPdfCreatorEventHelper(SequenceId sequenceId, IMetaInfo metaInfo) { this.sequenceId = sequenceId; this.metaInfo = metaInfo; } +//\endcond public override void OnEvent(AbstractProductITextEvent @event) { if (@event is AbstractContextBasedITextEvent) { @@ -58,4 +61,5 @@ public override EventConfirmationType GetConfirmationType() { return EventConfirmationType.ON_CLOSE; } } +//\endcond } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorProperties.cs b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorProperties.cs index 75482a5..2c4f11a 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorProperties.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/OcrPdfCreatorProperties.cs @@ -516,10 +516,12 @@ public virtual iText.Pdfocr.OcrPdfCreatorProperties SetMetaInfo(IMetaInfo metaIn return this; } +//\cond DO_NOT_DOCUMENT /// Returns meta info /// meta info internal virtual IMetaInfo GetMetaInfo() { return metaInfo; } +//\endcond } } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/OcrProcessContext.cs b/itext/itext.pdfocr.api/itext/pdfocr/OcrProcessContext.cs index 052fe89..6a7ce81 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/OcrProcessContext.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/OcrProcessContext.cs @@ -48,11 +48,13 @@ public virtual void SetOcrEventHelper(AbstractPdfOcrEventHelper eventHelper) { this.ocrEventHelper = eventHelper; } +//\cond DO_NOT_DOCUMENT /// Set extra OCR process properties. /// extra OCR process properties. internal virtual void SetOcrProcessProperties(IOcrProcessProperties ocrProcessProperties) { this.ocrProcessProperties = ocrProcessProperties; } +//\endcond /// Get extra OCR process properties. 
/// extra OCR process properties. diff --git a/itext/itext.pdfocr.api/itext/pdfocr/PdfCreatorUtil.cs b/itext/itext.pdfocr.api/itext/pdfocr/PdfCreatorUtil.cs index 00f7448..5e4bc1c 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/PdfCreatorUtil.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/PdfCreatorUtil.cs @@ -37,9 +37,12 @@ You should have received a copy of the GNU Affero General Public License using iText.Pdfocr.Logs; namespace iText.Pdfocr { +//\cond DO_NOT_DOCUMENT internal class PdfCreatorUtil { +//\cond DO_NOT_DOCUMENT /// The Constant to convert pixels to points. internal const float PX_TO_PT = 3f / 4f; +//\endcond /// The Constant for points per inch. private const float POINTS_PER_INCH = 72.0f; @@ -47,6 +50,7 @@ internal class PdfCreatorUtil { /// The logger. private static readonly ILogger LOGGER = ITextLogManager.GetLogger(typeof(PdfCreatorUtil)); +//\cond DO_NOT_DOCUMENT /// /// Calculates font size according to given bbox height, width and selected /// font. @@ -92,7 +96,9 @@ internal static float CalculateFontSize(Document document, String line, String f } return fontSize; } +//\endcond +//\cond DO_NOT_DOCUMENT /// /// Calculated real width of a paragraph with given text line, font provider /// and font size. @@ -113,7 +119,9 @@ internal static float GetRealLineWidth(Document document, String line, String fo IRenderer renderer = paragraph.CreateRendererSubTree().SetParent(document.GetRenderer()); return ((ParagraphRenderer)renderer).GetMinMaxWidth().GetMaxWidth(); } +//\endcond +//\cond DO_NOT_DOCUMENT /// Calculates image coordinates on the page. 
/// size of the page /// size of the image @@ -131,7 +139,9 @@ internal static Point CalculateImageCoordinates(Rectangle size, Rectangle imageS } return new Point(x, y); } +//\endcond +//\cond DO_NOT_DOCUMENT /// /// Retrieves /// @@ -189,7 +199,9 @@ internal static IList GetImageData(FileInfo inputImage, IImageRotatio } return images; } +//\endcond +//\cond DO_NOT_DOCUMENT /// /// Calculates the size of the PDF document page according to the provided /// . @@ -252,13 +264,16 @@ internal static Rectangle CalculateImageSize(ImageData imageData, ScaleMode scal return requiredSize; } } +//\endcond +//\cond DO_NOT_DOCUMENT /// Converts value from pixels to points. /// input value in pixels /// result value in points internal static float GetPoints(float pixels) { return pixels * PX_TO_PT; } +//\endcond /// Counts number of pages in the provided tiff image. /// @@ -274,4 +289,5 @@ private static int GetNumberOfPageTiff(FileInfo inputImage) { return numOfPages; } } +//\endcond } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrMetaInfoContainer.cs b/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrMetaInfoContainer.cs index 6bdfa92..47cc16e 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrMetaInfoContainer.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/PdfOcrMetaInfoContainer.cs @@ -33,8 +33,10 @@ public PdfOcrMetaInfoContainer(IMetaInfo metaInfo) { this.metaInfo = metaInfo; } +//\cond DO_NOT_DOCUMENT internal virtual IMetaInfo GetMetaInfo() { return metaInfo; } +//\endcond } } diff --git a/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsAggregator.cs b/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsAggregator.cs index 3b2e0dc..6d68c61 100644 --- a/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsAggregator.cs +++ b/itext/itext.pdfocr.api/itext/pdfocr/statistics/PdfOcrOutputTypeStatisticsAggregator.cs @@ -27,6 +27,7 @@ You should have received a copy of the GNU Affero General Public License 
using iText.Pdfocr.Exceptions; namespace iText.Pdfocr.Statistics { +//\cond DO_NOT_DOCUMENT /// Statistics aggregator which aggregates types of ocr processing. internal class PdfOcrOutputTypeStatisticsAggregator : AbstractStatisticsAggregator { private const String STRING_FOR_DATA = "data"; @@ -107,8 +108,11 @@ public override void Merge(AbstractStatisticsAggregator aggregator) { } } +//\cond DO_NOT_DOCUMENT internal static String GetKeyForType(PdfOcrOutputType type) { return OCR_OUTPUT_TYPES.Get(type); } +//\endcond } +//\endcond } diff --git a/itext/itext.pdfocr.api/pdfocr-api.nuspec b/itext/itext.pdfocr.api/pdfocr-api.nuspec index 0d06e07..e192ba9 100644 --- a/itext/itext.pdfocr.api/pdfocr-api.nuspec +++ b/itext/itext.pdfocr.api/pdfocr-api.nuspec @@ -2,7 +2,7 @@ itext.pdfocr.api - 3.0.2 + 4.0.0 iText pdfOcr Apryse Software Apryse Software @@ -18,7 +18,7 @@ - + diff --git a/itext/itext.pdfocr.tesseract4/PdfOcrTesseract4Extensions.cs b/itext/itext.pdfocr.tesseract4/PdfOcrTesseract4Extensions.cs index a7d6f31..80aed54 100644 --- a/itext/itext.pdfocr.tesseract4/PdfOcrTesseract4Extensions.cs +++ b/itext/itext.pdfocr.tesseract4/PdfOcrTesseract4Extensions.cs @@ -24,7 +24,7 @@ You should have received a copy of the GNU Affero General Public License using System.Collections.Generic; using System.Reflection; using System.Text; - +//\cond DO_NOT_DOCUMENT internal static class PdfOcrTesseract4Extensions { public static String Name(this Encoding e) @@ -74,3 +74,4 @@ public static Attribute GetCustomAttribute(this Assembly assembly, Type attribut } } +//\endcond \ No newline at end of file diff --git a/itext/itext.pdfocr.tesseract4/Properties/AssemblyInfo.cs b/itext/itext.pdfocr.tesseract4/Properties/AssemblyInfo.cs index 497e6c9..f393326 100644 --- a/itext/itext.pdfocr.tesseract4/Properties/AssemblyInfo.cs +++ b/itext/itext.pdfocr.tesseract4/Properties/AssemblyInfo.cs @@ -14,9 +14,9 @@ [assembly: Guid("0c4ceb00-9a56-4547-a925-5974a85a6048")] -[assembly: 
AssemblyVersion("3.0.2.0")] -[assembly: AssemblyFileVersion("3.0.2.0")] -[assembly: AssemblyInformationalVersion("3.0.2")] +[assembly: AssemblyVersion("4.0.0.0")] +[assembly: AssemblyFileVersion("4.0.0.0")] +[assembly: AssemblyInformationalVersion("4.0.0")] [assembly: InternalsVisibleTo("itext.pdfocr.tesseract4.tests, PublicKey=" + "00240000048000009400000006020000002400005253413100040000010001008b21ed5b3fc1c1" + "1996390981fe22bbe71a39a9e11d3c2cefddd6ee92920fa871f9666ae0fa941af0280d0653df04" + @@ -24,4 +24,4 @@ "009746bbdafcb75bcdbcecb7caf1f0f4b6e7d013906ba60b66eb1c8298e4efb052caf6cece4bf1" + "816902cc")] -[assembly: Versions.Attributes.KernelVersion("8.0.3.0")] +[assembly: Versions.Attributes.KernelVersion("9.0.0.0")] diff --git a/itext/itext.pdfocr.tesseract4/Properties/KernelVersionAttribute.cs b/itext/itext.pdfocr.tesseract4/Properties/KernelVersionAttribute.cs index 96767ba..5d0ba4a 100644 --- a/itext/itext.pdfocr.tesseract4/Properties/KernelVersionAttribute.cs +++ b/itext/itext.pdfocr.tesseract4/Properties/KernelVersionAttribute.cs @@ -23,6 +23,7 @@ You should have received a copy of the GNU Affero General Public License using System; namespace Versions.Attributes { + //\cond DO_NOT_DOCUMENT [AttributeUsage(AttributeTargets.Assembly)] internal class KernelVersionAttribute : Attribute { internal string KernelVersion { get; private set; } @@ -31,4 +32,5 @@ internal KernelVersionAttribute(string kernelVersion) { this.KernelVersion = kernelVersion; } } + //\endcond } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/AbstractTesseract4OcrEngine.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/AbstractTesseract4OcrEngine.cs index ad6ad60..2dbe075 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/AbstractTesseract4OcrEngine.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/AbstractTesseract4OcrEngine.cs @@ -47,9 +47,10 @@ namespace iText.Pdfocr.Tesseract4 { /// /// The implementation of /// . 
+ /// /// This class provides possibilities to perform OCR, to read data from input /// files and to return contained text in the required format. - /// Also there are possibilities to use features of "tesseract" + /// Also, there are possibilities to use features of "tesseract" /// (optical character recognition engine for various operating systems). /// public abstract class AbstractTesseract4OcrEngine : IOcrEngine, IProductAware { @@ -58,7 +59,9 @@ public abstract class AbstractTesseract4OcrEngine : IOcrEngine, IProductAware { (new HashSet(JavaUtil.ArraysAsList(ImageType.BMP, ImageType.PNG, ImageType.TIFF, ImageType. JPEG))); +//\cond DO_NOT_DOCUMENT internal ICollection processedUUID = new HashSet(); +//\endcond /// Set of properties. private Tesseract4OcrEngineProperties tesseract4OcrEngineProperties; @@ -403,6 +406,11 @@ public virtual ProductData GetProductData() { return PdfOcrTesseract4ProductData.GetInstance(); } + public virtual bool IsTaggingSupported() { + return false; + } + +//\cond DO_NOT_DOCUMENT /// /// Performs tesseract OCR using command line tool /// or a wrapper for Tesseract OCR API. @@ -437,7 +445,9 @@ internal virtual void DoTesseractOcr(FileInfo inputImage, IList output , int pageNumber, AbstractPdfOcrEventHelper eventHelper) { DoTesseractOcr(inputImage, outputFiles, outputFormat, pageNumber, true, eventHelper); } +//\endcond +//\cond DO_NOT_DOCUMENT /// /// Performs tesseract OCR using command line tool /// or a wrapper for Tesseract OCR API. @@ -472,7 +482,9 @@ internal virtual void DoTesseractOcr(FileInfo inputImage, IList output /// event helper internal abstract void DoTesseractOcr(FileInfo inputImage, IList outputFiles, OutputFormat outputFormat , int pageNumber, bool dispatchEvent, AbstractPdfOcrEventHelper eventHelper); +//\endcond +//\cond DO_NOT_DOCUMENT /// Gets path to provided tess data directory. 
/// /// path to provided tess data directory as @@ -486,7 +498,9 @@ internal virtual String GetTessData() { return GetTesseract4OcrEngineProperties().GetPathToTessData().FullName; } } +//\endcond +//\cond DO_NOT_DOCUMENT internal virtual PdfOcrTesseract4ProductEvent OnEvent(AbstractPdfOcrEventHelper eventHelper) { // usage event PdfOcrTesseract4ProductEvent @event = PdfOcrTesseract4ProductEvent.CreateProcessImageEvent(eventHelper.GetSequenceId @@ -494,11 +508,14 @@ internal virtual PdfOcrTesseract4ProductEvent OnEvent(AbstractPdfOcrEventHelper eventHelper.OnEvent(@event); return @event; } +//\endcond +//\cond DO_NOT_DOCUMENT internal virtual void OnEventStatistics(AbstractPdfOcrEventHelper eventHelper) { eventHelper.OnEvent(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.DATA, PdfOcrTesseract4ProductData .GetInstance())); } +//\endcond /// Reads data from the provided input image file. /// @@ -619,31 +636,45 @@ private void VerifyImageFormatValidity(FileInfo image) { } } +//\cond DO_NOT_DOCUMENT internal interface ITesseractOcrResult { } +//\endcond +//\cond DO_NOT_DOCUMENT internal class StringTesseractOcrResult : AbstractTesseract4OcrEngine.ITesseractOcrResult { private String data; +//\cond DO_NOT_DOCUMENT internal StringTesseractOcrResult(String data) { this.data = data; } +//\endcond +//\cond DO_NOT_DOCUMENT internal virtual String GetData() { return data; } +//\endcond } +//\endcond +//\cond DO_NOT_DOCUMENT internal class TextInfoTesseractOcrResult : AbstractTesseract4OcrEngine.ITesseractOcrResult { private IDictionary> textInfos; +//\cond DO_NOT_DOCUMENT internal TextInfoTesseractOcrResult(IDictionary> textInfos) { this.textInfos = textInfos; } +//\endcond +//\cond DO_NOT_DOCUMENT internal virtual IDictionary> GetTextInfos() { return this.textInfos; } +//\endcond } +//\endcond } } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/ImagePreprocessingUtil.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/ImagePreprocessingUtil.cs 
index 1ba18d5..78afc3d 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/ImagePreprocessingUtil.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/ImagePreprocessingUtil.cs @@ -33,6 +33,7 @@ You should have received a copy of the GNU Affero General Public License using iText.Pdfocr.Tesseract4.Logs; namespace iText.Pdfocr.Tesseract4 { +//\cond DO_NOT_DOCUMENT /// Utilities class to work with images. /// /// Utilities class to work with images. @@ -47,6 +48,7 @@ internal class ImagePreprocessingUtil { private ImagePreprocessingUtil() { } +//\cond DO_NOT_DOCUMENT /// Counts number of pages in the provided tiff image. /// /// input image @@ -60,7 +62,9 @@ internal static int GetNumberOfPageTiff(FileInfo inputImage) { raf.Close(); return numOfPages; } +//\endcond +//\cond DO_NOT_DOCUMENT /// Checks whether image format is TIFF. /// /// input image @@ -70,7 +74,9 @@ internal static int GetNumberOfPageTiff(FileInfo inputImage) { internal static bool IsTiffImage(FileInfo inputImage) { return GetImageType(inputImage) == ImageType.TIFF; } +//\endcond +//\cond DO_NOT_DOCUMENT /// Gets the image type. /// /// input image @@ -93,7 +99,9 @@ internal static ImageType GetImageType(FileInfo inputImage) { } return type; } +//\endcond +//\cond DO_NOT_DOCUMENT /// Reads provided image file using stream. /// /// input image @@ -110,7 +118,9 @@ internal static System.Drawing.Bitmap ReadImageFromFile(FileInfo inputFile) { @is.Dispose(); return bi; } +//\endcond +//\cond DO_NOT_DOCUMENT /// /// Reads input file as Leptonica /// @@ -131,7 +141,9 @@ internal static System.Drawing.Bitmap ReadAsPixAndConvertToBufferedImage(FileInf Pix pix = TesseractOcrUtil.ReadPixFromFile(inputImage); return TesseractOcrUtil.ConvertPixToImage(pix); } +//\endcond +//\cond DO_NOT_DOCUMENT /// Performs basic image preprocessing using buffered image (if provided). /// /// Performs basic image preprocessing using buffered image (if provided). 
@@ -166,7 +178,9 @@ internal static Pix PreprocessImage(FileInfo inputFile, int pageNumber, ImagePre } return TesseractOcrUtil.PreprocessPix(pix, imagePreprocessingOptions); } +//\endcond +//\cond DO_NOT_DOCUMENT /// /// Reads input image as a /// . @@ -216,5 +230,7 @@ internal static System.Drawing.Bitmap ReadImage(FileInfo inputImage) { } return bufferedImage; } +//\endcond } +//\endcond } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4EventHelper.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4EventHelper.cs index ac9a871..ad8704d 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4EventHelper.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4EventHelper.cs @@ -26,10 +26,13 @@ You should have received a copy of the GNU Affero General Public License using iText.Pdfocr; namespace iText.Pdfocr.Tesseract4 { +//\cond DO_NOT_DOCUMENT /// Helper class for working with events. internal class Tesseract4EventHelper : AbstractPdfOcrEventHelper { +//\cond DO_NOT_DOCUMENT internal Tesseract4EventHelper() { } +//\endcond // do nothing public override void OnEvent(AbstractProductITextEvent @event) { @@ -47,4 +50,5 @@ public override EventConfirmationType GetConfirmationType() { return EventConfirmationType.ON_DEMAND; } } +//\endcond } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.cs index 59d052f..751c83e 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.cs @@ -44,6 +44,7 @@ namespace iText.Pdfocr.Tesseract4 { /// The implementation of /// /// for tesseract OCR. 
+ /// /// This class provides possibilities to use features of "tesseract" CL tool /// (optical character recognition engine for various operating systems). /// Please note that it's assumed that "tesseract" has already been @@ -97,6 +98,7 @@ public void SetPathToExecutable(String path) { pathToExecutable = path; } +//\cond DO_NOT_DOCUMENT /// /// Performs tesseract OCR using command line tool for the selected page /// of input image (by default 1st). @@ -218,6 +220,7 @@ internal override void DoTesseractOcr(FileInfo inputImage, IList outpu } } } +//\endcond /// Sets hocr output format. /// result command as list of strings diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4FileResultEventHelper.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4FileResultEventHelper.cs index b15802a..a2f6511 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4FileResultEventHelper.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4FileResultEventHelper.cs @@ -27,17 +27,22 @@ You should have received a copy of the GNU Affero General Public License using iText.Pdfocr.Tesseract4.Actions.Events; namespace iText.Pdfocr.Tesseract4 { +//\cond DO_NOT_DOCUMENT /// Helper class for working with events. internal class Tesseract4FileResultEventHelper : AbstractPdfOcrEventHelper { private AbstractPdfOcrEventHelper wrappedEventHelper; +//\cond DO_NOT_DOCUMENT internal Tesseract4FileResultEventHelper() : this(null) { } +//\endcond +//\cond DO_NOT_DOCUMENT internal Tesseract4FileResultEventHelper(AbstractPdfOcrEventHelper wrappedEventHelper) { this.wrappedEventHelper = wrappedEventHelper == null ? 
new Tesseract4EventHelper() : wrappedEventHelper; } +//\endcond public override void OnEvent(AbstractProductITextEvent @event) { if (!IsProcessImageEvent(@event) && !IsConfirmForProcessImageEvent(@event)) { @@ -64,4 +69,5 @@ private static bool IsConfirmForProcessImageEvent(AbstractProductITextEvent @eve ()); } } +//\endcond } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4LibOcrEngine.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4LibOcrEngine.cs index 0f66c02..7d509ef 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4LibOcrEngine.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4LibOcrEngine.cs @@ -44,8 +44,10 @@ namespace iText.Pdfocr.Tesseract4 { /// The implementation of /// /// for tesseract OCR. + /// /// This class provides possibilities to use features of "tesseract" /// using tess4j. + /// /// Please note that this class is not thread-safe, in other words this Tesseract engine cannot /// be used for multithreaded processing. You should create one instance per thread /// @@ -120,6 +122,7 @@ public virtual void InitializeTesseract(OutputFormat outputFormat) { ().GetPageSegMode(), GetTesseract4OcrEngineProperties().GetPathToUserWordsFile()); } +//\cond DO_NOT_DOCUMENT /// /// Performs tesseract OCR using wrapper for Tesseract OCR API for the selected page /// of input image (by default 1st). 
@@ -215,6 +218,7 @@ internal override void DoTesseractOcr(FileInfo inputImage, IList outpu } } } +//\endcond /// /// Validates Tess Data path, diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4MetaInfo.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4MetaInfo.cs index 2c69b19..2a4d981 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4MetaInfo.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4MetaInfo.cs @@ -23,6 +23,8 @@ You should have received a copy of the GNU Affero General Public License using iText.Commons.Actions.Contexts; namespace iText.Pdfocr.Tesseract4 { +//\cond DO_NOT_DOCUMENT internal class Tesseract4MetaInfo : IMetaInfo { } +//\endcond } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4OcrEngineProperties.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4OcrEngineProperties.cs index b74acff..8210a48 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4OcrEngineProperties.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/Tesseract4OcrEngineProperties.cs @@ -36,12 +36,14 @@ namespace iText.Pdfocr.Tesseract4 { /// . /// public class Tesseract4OcrEngineProperties : OcrEngineProperties { +//\cond DO_NOT_DOCUMENT /// Default suffix for user-word file. /// /// Default suffix for user-word file. /// (e.g. name: 'eng.user-words') /// internal const String DEFAULT_USER_WORDS_SUFFIX = "user-words"; +//\endcond /// Default language for OCR. 
private const String DEFAULT_LANGUAGE = "eng"; @@ -248,6 +250,7 @@ public iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetTextPositioning( return this; } +//\cond DO_NOT_DOCUMENT /// /// Using provided list of words there will be created /// temporary file containing words (one per line) which @@ -301,7 +304,9 @@ internal virtual iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetUserWo } return this; } +//\endcond +//\cond DO_NOT_DOCUMENT /// /// Using provided input stream there will be created /// temporary file (with name 'language.user-words') @@ -366,7 +371,9 @@ internal virtual iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetUserWo } return this; } +//\endcond +//\cond DO_NOT_DOCUMENT /// Returns path to the user words file. /// /// Returns path to the user words file. @@ -383,7 +390,9 @@ internal virtual iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetUserWo internal String GetPathToUserWordsFile() { return pathToUserWordsFile; } +//\endcond +//\cond DO_NOT_DOCUMENT /// Sets path to the user words file. /// /// Sets path to the user words file. @@ -405,7 +414,9 @@ internal iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetPathToUserWord ) { return SetPathToUserWordsFile(pathToUserWordsFile, false); } +//\endcond +//\cond DO_NOT_DOCUMENT /// Sets path to the user words file. /// /// path to user words file @@ -424,12 +435,15 @@ internal iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetPathToUserWord this.isUserWordsFileTemporary = isTempFile; return this; } +//\endcond +//\cond DO_NOT_DOCUMENT /// Indicates if user words file is temporary and has to be removed. /// true if the file is temporary, otherwise false. 
internal bool IsUserWordsFileTemporary() { return isUserWordsFileTemporary; } +//\endcond /// /// Gets diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractHelper.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractHelper.cs index 5002325..da9940d 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractHelper.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractHelper.cs @@ -97,6 +97,7 @@ public class TesseractHelper { private TesseractHelper() { } +//\cond DO_NOT_DOCUMENT /// /// Parses each hocr file from the provided list, retrieves text, and /// returns data in the format described below. @@ -164,7 +165,9 @@ internal static IDictionary> ParseHocrFile(IList } return imageData; } +//\endcond +//\cond DO_NOT_DOCUMENT /// Get and align (if needed) bbox of the element. internal static Rectangle GetAlignedBBox(iText.StyledXmlParser.Jsoup.Nodes.Element @object, TextPositioning textPositioning, Rectangle pageBbox, IDictionary unparsedBBoxes @@ -180,7 +183,9 @@ internal static Rectangle GetAlignedBBox(iText.StyledXmlParser.Jsoup.Nodes.Eleme } return box; } +//\endcond +//\cond DO_NOT_DOCUMENT /// Parses element bbox. /// element containing bbox /// element containing parent page bbox @@ -216,7 +221,9 @@ internal static Rectangle ParseBBox(iText.StyledXmlParser.Jsoup.Nodes.Node node, ToPoints(bbox[RIGHT_IDX]), pageBBox.GetTop() - ToPoints(bbox[BOTTOM_IDX])); } } +//\endcond +//\cond DO_NOT_DOCUMENT /// Sometimes hOCR file contains broke character bboxes which are equal to page bbox. /// /// Sometimes hOCR file contains broke character bboxes which are equal to page bbox. @@ -246,17 +253,23 @@ internal static void DetectAndFixBrokenBBoxes(iText.StyledXmlParser.Jsoup.Nodes. } } } +//\endcond +//\cond DO_NOT_DOCUMENT /// Converts points to pixels. internal static float ToPixels(float pt) { return pt / PX_TO_PT; } +//\endcond +//\cond DO_NOT_DOCUMENT /// Converts pixels to points. 
internal static float ToPoints(float px) { return px * PX_TO_PT; } +//\endcond +//\cond DO_NOT_DOCUMENT /// Deletes file using provided path. /// path to the file to be deleted internal static void DeleteFile(String pathToFile) { @@ -275,7 +288,9 @@ internal static void DeleteFile(String pathToFile) { , e.Message)); } } +//\endcond +//\cond DO_NOT_DOCUMENT /// Reads from text file to string. /// /// input @@ -299,7 +314,9 @@ internal static String ReadTxtFile(FileInfo txtFile) { } return content; } +//\endcond +//\cond DO_NOT_DOCUMENT /// /// Writes provided /// @@ -326,7 +343,9 @@ internal static void WriteToTextFile(String path, String data) { throw new PdfOcrInputTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.CANNOT_WRITE_TO_FILE, e); } } +//\endcond +//\cond DO_NOT_DOCUMENT /// Runs given command. /// path to the executable /// @@ -337,7 +356,9 @@ internal static void WriteToTextFile(String path, String data) { internal static void RunCommand(String execPath, IList paramsList) { RunCommand(execPath, paramsList, null); } +//\endcond +//\cond DO_NOT_DOCUMENT /// Runs given command from the specific working directory. /// path to the executable /// @@ -361,6 +382,7 @@ internal static void RunCommand(String execPath, IList paramsList, Strin throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_FAILED); } } +//\endcond /// Gets list of text infos from hocr page. 
private static IList GetTextData(iText.StyledXmlParser.Jsoup.Nodes.Element page, Tesseract4OcrEngineProperties diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs index 4969cdf..f739009 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs @@ -38,6 +38,8 @@ You should have received a copy of the GNU Affero General Public License using Tesseract; namespace iText.Pdfocr.Tesseract4 { + + //\cond DO_NOT_DOCUMENT /// /// Utilities class to work with tesseract command line tool and image /// preprocessing using @@ -1046,4 +1048,5 @@ internal static ImageData ApplyRotation(ImageData imageData) } } } + //\endcond } diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/actions/data/PdfOcrTesseract4ProductData.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/actions/data/PdfOcrTesseract4ProductData.cs index 68c0304..4235247 100644 --- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/actions/data/PdfOcrTesseract4ProductData.cs +++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/actions/data/PdfOcrTesseract4ProductData.cs @@ -34,7 +34,7 @@ public class PdfOcrTesseract4ProductData { private const String PDF_OCR_TESSERACT4_PUBLIC_PRODUCT_NAME = "pdfOCR-Tesseract4"; - private const String PDF_OCR_VERSION = "3.0.2"; + private const String PDF_OCR_VERSION = "4.0.0"; private const int PDF_OCR_COPYRIGHT_SINCE = 2000; diff --git a/itext/itext.pdfocr.tesseract4/pdfocr-tesseract4.nuspec b/itext/itext.pdfocr.tesseract4/pdfocr-tesseract4.nuspec index f9e829b..528d7b2 100644 --- a/itext/itext.pdfocr.tesseract4/pdfocr-tesseract4.nuspec +++ b/itext/itext.pdfocr.tesseract4/pdfocr-tesseract4.nuspec @@ -2,7 +2,7 @@ itext.pdfocr.tesseract4 - 3.0.2 + 4.0.0 iText pdfOcr Apryse Software Apryse Software @@ -18,7 +18,7 @@ - + diff --git 
a/port-hash b/port-hash index 9e2b362..5591138 100644 --- a/port-hash +++ b/port-hash @@ -1 +1 @@ -151cd11a394f65e278ed1cafd7d8a24a461940ad +55a45d5e726da0a60917476f8417bb86c97ebeff