forked from ocropus/hocr-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhocr-split
executable file
·41 lines (31 loc) · 1.2 KB
/
hocr-split
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python
# split an hOCR file into individual pages
from __future__ import print_function
import argparse
import re
from lxml import etree, html
################################################################
# main program
################################################################
# Command line: an input hOCR file plus a printf-style output name pattern
# whose integer placeholder receives the page number.
parser = argparse.ArgumentParser(
    description="split a multipage hOCR file into single pages")
parser.add_argument("file", help="hOCR file", type=argparse.FileType('r'))
parser.add_argument("pattern", help="naming pattern, e.g. 'base-%%03d.html'")
parser.add_argument("--start-from-zero", action="store_true",
                    help="start number of page from zero, e.g. 'base-000.html'")
args = parser.parse_args()

# Validate user input with parser.error() instead of assert: asserts are
# stripped when Python runs with -O and otherwise only produce a bare
# AssertionError traceback.
if not re.search('%[0-9]*d', args.pattern):
    parser.error("pattern must contain an integer placeholder, "
                 "e.g. 'base-%03d.html'")
try:
    # Trial-format once so a malformed pattern (e.g. a stray extra '%s')
    # is reported before any output file has been written.
    args.pattern % 0
except (TypeError, ValueError) as exc:
    parser.error("invalid naming pattern %r: %s" % (args.pattern, exc))

try:
    doc = etree.parse(args.file, html.XHTMLParser())
finally:
    # argparse.FileType opened the handle; release it once parsing is done.
    args.file.close()

pages = doc.xpath("//*[@class='ocr_page']")
if not pages:
    parser.error("no element with class 'ocr_page' found in the input file")

# Every output document re-uses the parent of the first page as the place
# where the single remaining page is attached.
container = pages[0].getparent()
if container is None:
    parser.error("the 'ocr_page' element must not be the document root")

# Detach all pages once.  Each page is removed from its *own* parent: the
# XPath above is document-wide, so a page is not guaranteed to be a direct
# child of `container`, and container.remove() raises ValueError for a
# non-child element.
for page in pages:
    page.getparent().remove(page)

first_index = 0 if args.start_from_zero else 1
for index, page in enumerate(pages, first_index):
    # Attach exactly one page, write the document, then detach it again so
    # the next output file contains only the next page.
    container.append(page)
    doc.write(args.pattern % index, pretty_print=True)
    container.remove(page)