-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathOCR4Linux.py
277 lines (240 loc) · 10.2 KB
/
OCR4Linux.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# ========================================================================================================================
# Author:
# Mohamed Hussein Al-Adawy
# Version: 1.2.0
# Description:
# OCR4Linux.py is a Python script that handles image preprocessing and text extraction using Tesseract OCR.
# The script takes an input image, processes it for optimal OCR accuracy, and extracts text while preserving
# line breaks and layout.
#
# Features:
# - Image preprocessing (grayscale conversion, thresholding, noise removal)
# - Text extraction with layout preservation
# - Confidence-based filtering for improved accuracy
# - Support for multiple image formats
# - UTF-8 text output
#
# Dependencies:
# - PIL (Python Imaging Library)
# - pytesseract
# - OpenCV (cv2)
# - numpy
#
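# Class Structure:
#   TesseractConfig:
#       - extract_text_with_lines(): Extracts text while preserving layout
#       - main(): Runs the extraction and writes the text to the output file
#   Program:
#       - help(): Displays usage instructions
#       - check_arguments(): Validates the command-line arguments
#       - list_available_languages(): Lists the languages available to Tesseract
#       - check_image_path(): Checks that the input image exists
#       - main(): Orchestrates the OCR process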
#
# Usage:
#   python OCR4Linux.py <image_path> <output_path>
#   python OCR4Linux.py [-l | --list-langs]
#   python OCR4Linux.py [-h | --help]
#
# Example:
# python OCR4Linux.py screenshot.png output.txt
# ========================================================================================================================
import sys
import os
from PIL import Image
import pytesseract
class TesseractConfig:
    """
    Configure Tesseract OCR and extract the text of a single image.

    Attributes:
        image_path (str): Path of the image to read.
        output_path (str): Path of the text file to write.
        oem_mode (int): OCR Engine Mode passed to Tesseract as ``--oem``.
        psm_mode (int): Page Segmentation Mode passed to Tesseract as ``--psm``.
        available_langs (list): Languages reported by ``pytesseract.get_languages()``.
        langs (str): ``+``-joined language names handed to Tesseract.
        custom_config (str): Extra command-line options handed to Tesseract.
        ouput_encoding (str): Encoding used for the output file. The attribute
            name keeps its existing spelling so current callers keep working.

    Methods:
        extract_text_with_lines(image) -> str:
            Runs Tesseract on an opened image and returns the recognised text.
        main() -> int:
            Opens the image, extracts its text and writes it to the output
            file. Returns 0 on success, 1 on any failure.
    """

    def __init__(self, image_path: str, output_path: str):
        """
        Store the paths and build the Tesseract configuration.

        Args:
            image_path (str): The path to the input image file.
            output_path (str): The path to the file where the text is saved.
        """
        self.image_path = image_path
        self.output_path = output_path
        self.oem_mode = 3  # Tesseract's default engine selection
        self.psm_mode = 6  # Treat the image as a single uniform block of text
        self.available_langs = pytesseract.get_languages()
        # Drop empty entries BEFORE deciding on the fallback: a list that only
        # holds empty strings would otherwise produce an empty language string.
        usable_langs = [lang for lang in (self.available_langs or []) if lang]
        self.langs = '+'.join(usable_langs) or 'eng'
        self.custom_config = f'--oem {self.oem_mode} --psm {self.psm_mode}'
        self.ouput_encoding = 'utf-8'

    def extract_text_with_lines(self, image: Image.Image) -> str:
        """
        Extract text from an image with Tesseract OCR.

        Args:
            image: The opened image to read. It is forwarded unchanged to
                ``pytesseract.image_to_string``.

        Returns:
            The text returned by ``pytesseract.image_to_string`` for the
            configured languages and options.
        """
        return pytesseract.image_to_string(
            image=image, lang=self.langs, config=self.custom_config)

    def main(self) -> int:
        """
        Process the image and save the extracted text.

        Steps:
            1. Open the image at ``image_path`` and extract its text.
            2. Write the text to ``output_path`` using ``ouput_encoding``.

        Returns:
            int: 0 if the text was extracted and written, 1 if anything failed
            (the reason is printed).
        """
        # This is the boundary that turns failures into an exit status, so a
        # broad handler is intentional here: unreadable images, Tesseract
        # errors and write failures all map to status 1.
        try:
            with Image.open(self.image_path) as image:
                extracted_text = self.extract_text_with_lines(image)

            with open(self.output_path, 'w', encoding=self.ouput_encoding) as file:
                file.write(extracted_text)

            return 0
        except Exception as e:
            print(f"Error processing image because: {str(e)}")
            return 1
class Program:
    """
    Command-line front end of OCR4Linux.

    Parses ``sys.argv``, prints help or the list of available languages when
    asked, and otherwise hands the image and output paths to TesseractConfig.
    """

    def __init__(self):
        """
        Initializes the Program class with the following attributes:
            - args_num: Number of entries expected in sys.argv for an OCR run
              (script path, image path, output path).
            - author: Author of the script.
            - email: Author's email address.
            - github: URL to the GitHub repository.
            - version: Version of the script.
            - description: Brief description of the script's functionality.
            - useges: List of usage patterns for the script.
            - examples: List of example commands for using the script.
            - arguments: List of arguments that the script accepts with their descriptions.
        """
        self.args_num = 3
        self.author = "Mohamed Hussein Al-Adawy"
        self.email = "mohamed.h.eladwy@gmail.com"
        self.github = "https://github.com/moheladwy/OCR4Linux"
        self.version = "1.2.0"
        # NOTE(review): the second sentence below reads as truncated ("takes an
        # input ... based on the language"); the text is left untouched here,
        # confirm the intended wording with the author.
        self.description = \
            " OCR4Linux.py is a Python script that handles image preprocessing\n" + \
            " and text extraction using Tesseract OCR. The script takes an input\n" + \
            " based on the language in the image."
        self.useges = [
            "python OCR4Linux.py <image_path> <output_path>",
            "python OCR4Linux.py [-l | --list-langs]",
            "python OCR4Linux.py [-h | --help]"
        ]
        self.examples = [
            "python OCR4Linux.py screenshot.png output.txt",
            "python OCR4Linux.py -l",
            "python OCR4Linux.py -h"
        ]
        self.arguments = [
            "file_path: Path to the python script",
            "image_path: Path to the image file",
            "output_path: Path to the output text file",
            "-l, --list-langs: List all available languages for OCR in the system",
            "-h, --help: Display this help message, then exit"
        ]

    def help(self) -> None:
        """
        Prints the usage instructions for the OCR4Linux script.

        The output contains the version and author details, the description,
        the usage patterns, the example commands and the accepted arguments.
        """
        print("OCR4Linux - OCR script for Linux using Tesseract")
        print(f"Version: {self.version}")
        print(f"Author: {self.author}")
        print(f"Email: {self.email}")
        print(f"GitHub: {self.github}")
        print()
        print("Description:")
        print(self.description)
        print()
        print("Usage:")
        for usege in self.useges:
            print(f" - {usege}")
        print()
        print("Example:")
        for example in self.examples:
            print(f" - {example}")
        print()
        print("Arguments:")
        for argument in self.arguments:
            print(f" {argument}")

    def check_arguments(self) -> int:
        """
        Checks the command line arguments and handles the informational options.

        Handles the following options:
            - Standard usage: <image_path> <output_path>
            - Help: -h or --help
            - List languages: -l or --list-langs

        Returns:
            int: 0 if an informational option (-l / -h) was handled and the
            program should exit successfully, 1 if the argument count is wrong
            (the help text is printed), 2 if the arguments describe an OCR run
            that should proceed.
        """
        if len(sys.argv) == 2 and sys.argv[1] in ['-l', '--list-langs']:
            self.list_available_languages()
            return 0
        elif len(sys.argv) == 2 and sys.argv[1] in ['-h', '--help']:
            self.help()
            return 0
        elif len(sys.argv) != self.args_num:
            self.help()
            return 1
        return 2

    def list_available_languages(self) -> None:
        """
        Displays all languages reported by pytesseract.get_languages().

        Prints an error line instead when the list is empty.
        """
        langs = pytesseract.get_languages()
        if not langs:
            print("Error: No languages found")
            return
        print("Available languages for OCR:")
        for lang in langs:
            print(f" - {lang}")

    def check_image_path(self, image_path: str) -> bool:
        """
        Checks that the specified image path points to an existing file.

        Args:
            image_path: The path to the image file to be checked.

        Returns:
            bool: True if the path exists and is a regular file, False
            otherwise (an error line is printed).
        """
        # isfile() rather than exists(): a directory is not a usable image and
        # should be rejected here instead of failing later inside the OCR step.
        if not os.path.isfile(image_path):
            print(f"Error: File '{image_path}' not found")
            return False
        return True

    def main(self) -> int:
        """
        Main function to execute the OCR process.

        This function performs the following steps:
            1. Checks the command line arguments (may print help or languages).
            2. Verifies that the image file exists.
            3. Creates an instance of the TesseractConfig class and runs the OCR process.

        Returns:
            int: 1 if there is an error with the arguments or image path, 0 if
            an informational option was handled, otherwise the result of the
            TesseractConfig main function.
        """
        # check_arguments() returns 0/1 when the program should stop with that
        # status and 2 when an OCR run should proceed.
        result = self.check_arguments()
        if result == 1:
            return 1
        elif result == 0:
            return 0

        # Check if the image file exists
        if not self.check_image_path(sys.argv[1]):
            return 1

        # Create an instance of the TesseractConfig class
        tesseract = TesseractConfig(sys.argv[1], sys.argv[2])
        return tesseract.main()
if __name__ == "__main__":
    # Run the command-line program and hand its status code to the shell.
    exit_status = Program().main()
    sys.exit(exit_status)