-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpisgah_pdf.py
183 lines (167 loc) · 7.92 KB
/
pisgah_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python3
#
# 2021 11 08 jcm added code to search more carefully for the case number, since different
# systems produce slightly different formats for PDF files
# 2021 11 08 jcm added code to give a warning if keywords were not found in the input files:
# CIPRS will have: "Court Case:"
# Lexis will have: "Case Number:"
#
#
# Inputs (provided by legal department)
# Two PDF files as input
# - one Lexis report PDF file and
# - one CIPRS report PDF file
#
# Outputs
# One text file as output, which has a list of all case numbers in the Lexis PDF report that are
# NOT in the CIPRS PDF report, with the name:
# - one text output file (also shown on the screen) called
# "In-Lexis-Not-In-CIPRS-<timestamp>.txt"
# (default location is same directory as exe file)
#
# Note: As per the client, the last 6 characters of a case number are used to identify the
# case, even though the other characters might be coded differently.
# external libraries
import fitz
from datetime import datetime
import sys
def find_case_number(str):
    """Pull the case number out of the text that follows a keyword.

    The case number starts at the first alpha-numeric character and runs to
    the next newline. Anything before that first character (including
    newlines) is skipped, and every non-alpha-numeric character inside the
    case number (spaces, dashes, ...) is dropped.

    Args:
        str: the text immediately following the keyword.

    Returns:
        The alpha-numeric characters of the case number, or "" when the text
        contains no alpha-numeric character at all.
    """
    kept_chars = []
    for ch in str:
        # Letters are limited to the ASCII ranges; digits are whatever
        # str.isdigit() accepts.
        is_ascii_letter = ('A' <= ch <= 'Z') or ('a' <= ch <= 'z')
        if ch.isdigit() or is_ascii_letter:
            kept_chars.append(ch)
        elif ch == '\n' and kept_chars:
            # A newline only ends the scan once the case number has started.
            break
    return "".join(kept_chars)
def get_lexis_case_numbers(doc):
    """Collect the case numbers listed in a Lexis report PDF.

    Each page's text is split on the exact keyword "Case Number:", and the
    text after every occurrence is handed to find_case_number(). The text
    after the keyword sometimes starts with special characters (such as a
    newline) and sometimes with just a space, which is why the extraction is
    delegated rather than done with a fixed offset.

    Args:
        doc: an open PyMuPDF document (as returned by fitz.open).

    Returns:
        A list with one entry per keyword occurrence, in page order.
        Duplicates are kept, and an entry is "" when no alpha-numeric
        character follows the keyword on that page.
    """
    keyword = "Case Number:"
    lexis_case_number_list = []
    # Iterating the document yields its pages in order and avoids the
    # camelCase pageCount/loadPage names that newer PyMuPDF releases removed.
    for page in doc:
        # get_text() is the current name; fall back to getText() so older
        # PyMuPDF installations keep working.
        extract_text = getattr(page, "get_text", None) or page.getText
        page_text = extract_text("text")
        # Element 0 is the text before the first keyword, so skip it.
        for case_number_plus in page_text.split(keyword)[1:]:
            lexis_case_number_list.append(find_case_number(case_number_plus))
    return lexis_case_number_list
def get_ciprs_case_numbers(doc):
    """Collect the case numbers listed in a CIPRS report PDF.

    Each page's text is split on the exact keyword "Court Case:", and the
    text after every occurrence is handed to find_case_number().

    Args:
        doc: an open PyMuPDF document (as returned by fitz.open).

    Returns:
        A list with one entry per keyword occurrence, in page order.
        Duplicates are kept, and an entry is "" when no alpha-numeric
        character follows the keyword on that page.
    """
    keyword = "Court Case:"
    ciprs_case_number_list = []
    # Iterating the document yields its pages in order and avoids the
    # camelCase pageCount/loadPage names that newer PyMuPDF releases removed.
    for page in doc:
        # get_text() is the current name; fall back to getText() so older
        # PyMuPDF installations keep working.
        extract_text = getattr(page, "get_text", None) or page.getText
        page_text = extract_text("text")
        # Element 0 is the text before the first keyword, so skip it.
        for case_number_plus in page_text.split(keyword)[1:]:
            ciprs_case_number_list.append(find_case_number(case_number_plus))
    return ciprs_case_number_list
def get_lexis_cases_not_in_ciprs(lexis_case_number_list, ciprs_case_number_list):
    """Return the Lexis case numbers that have no match in the CIPRS list.

    Two case numbers are treated as the same case when their last 6
    characters are equal (numbers shorter than 6 characters are compared in
    full).

    Args:
        lexis_case_number_list: case numbers extracted from the Lexis report.
        ciprs_case_number_list: case numbers extracted from the CIPRS report.

    Returns:
        The unmatched Lexis case numbers in their original order. Each
        distinct last-6 suffix is reported once, using the first full case
        number that carried it.
    """
    # Build the CIPRS suffixes once so each Lexis lookup is a set membership
    # test instead of a scan over the whole CIPRS list.
    ciprs_suffixes = {case_number[-6:] for case_number in ciprs_case_number_list}
    reported_suffixes = set()
    lexis_cases_not_found = []
    for lexis_case_number in lexis_case_number_list:
        suffix = lexis_case_number[-6:]
        if suffix in ciprs_suffixes or suffix in reported_suffixes:
            continue
        # A suffix that is new here cannot already have a full case number in
        # the result, so no separate full-number duplicate check is needed.
        reported_suffixes.add(suffix)
        lexis_cases_not_found.append(lexis_case_number)
    return lexis_cases_not_found
def file_comparison(lexis_file_path, ciprs_file_path, out_file_path):
    """Write a report of the case numbers found in Lexis but not in CIPRS.

    Args:
        lexis_file_path: path of the Lexis report PDF.
        ciprs_file_path: path of the CIPRS report PDF.
        out_file_path: path of the text report to create (overwritten if it
            already exists).

    Any exception raised by fitz.open() or by opening the output file
    propagates to the caller.
    """
    dt_string = datetime.now().strftime("%m/%d/%Y %H:%M:%S")

    # Close each document as soon as its case numbers are extracted so the
    # PDF file handles are not left open.
    lexis_doc = fitz.open(lexis_file_path)
    try:
        lexis_case_numbers = get_lexis_case_numbers(lexis_doc)
    finally:
        lexis_doc.close()

    ciprs_doc = fitz.open(ciprs_file_path)
    try:
        ciprs_case_numbers = get_ciprs_case_numbers(ciprs_doc)
    finally:
        ciprs_doc.close()

    lexis_not_in_ciprs = get_lexis_cases_not_in_ciprs(lexis_case_numbers, ciprs_case_numbers)

    rule = "--------------------------------------------"
    with open(out_file_path, "w") as f:
        f.write(
            "Lexis versus CIPRS v3 (" + dt_string + ")" + "\n" +
            rule + "\n" +
            "Lexis PDF: " + lexis_file_path + "\n" +
            "CIPRS PDF: " + ciprs_file_path + "\n" +
            rule + "\n" +
            # 2021 11 11 jcm added lexis and ciprs counts to report
            str(len(lexis_case_numbers)) + " Lexis cases found" + "\n" +
            str(len(ciprs_case_numbers)) + " CIPRS cases found" + "\n" +
            "\n"
        )
        for lexis_case_number in lexis_not_in_ciprs:
            f.write("Lexis case " + lexis_case_number + " not in CIPRS" + "\n")
        # 2021 11 08 jcm warn when a keyword was never found, which usually
        # means the wrong file was supplied
        if not lexis_case_numbers:
            f.write("WARNING! Wrong file? Lexis PDF file does not contain any cases (keyword='Case Number:')" + "\n")
        if not ciprs_case_numbers:
            f.write("WARNING! Wrong file? CIPRS PDF file does not contain any cases (keyword='Court Case:')" + "\n")
        # 2021 11 11 jcm report explicitly when no missing cases were found
        if not lexis_not_in_ciprs:
            f.write("All Lexis PDF case numbers were found in the CIPRS PDF" + "\n")
def _pdf_can_be_opened(path):
    """Return True if PyMuPDF can open *path*; the probe document is closed again."""
    try:
        doc = fitz.open(path)
    except Exception:
        # Any failure to open (missing file, unreadable data, ...) counts as
        # "not found". KeyboardInterrupt/SystemExit are deliberately not
        # caught here.
        return False
    doc.close()
    return True


def main():
    """Compare the Lexis and CIPRS PDFs and write the timestamped report.

    Input paths default to "Lexis.pdf" and "CIPRS.pdf". They are overridden
    only when exactly two command-line arguments are given (Lexis first,
    CIPRS second), e.g. via drag/drop. If either PDF cannot be opened, an
    error report is written to the output file instead of the comparison.
    """
    # Inputs (default)
    lexis_file_path = "Lexis.pdf"
    ciprs_file_path = "CIPRS.pdf"
    # override defaults if filenames passed through arguments (via drag/drop)
    file_paths = sys.argv[1:]  # skip first argument, which is the script name
    if len(file_paths) == 2:
        lexis_file_path, ciprs_file_path = file_paths

    # Outputs
    # "%f" yields 6 microsecond digits; dropping the last 4 keeps hundredths
    # of a second in the file name.
    dt_string = datetime.now().strftime("%Y-%m-%d-%H-%M-%S.%f")[:-4]
    out_file_path = "In-Lexis-Not-In-CIPRS-" + dt_string + ".txt"

    # test to make sure both input files can be opened
    lexis_file_found = _pdf_can_be_opened(lexis_file_path)
    ciprs_file_found = _pdf_can_be_opened(ciprs_file_path)

    if not (lexis_file_found and ciprs_file_found):
        with open(out_file_path, "w") as f:
            f.write("Lexis versus CIPRS (" + dt_string + ")" + "\n" +
                    "--------------------------------------------" + "\n")
            if not lexis_file_found:
                f.write("Lexis PDF: ERROR: FILE NOT FOUND: " + lexis_file_path + "\n")
            if not ciprs_file_found:
                f.write("CIPRS PDF: ERROR: FILE NOT FOUND: " + ciprs_file_path + "\n")
            f.write("--------------------------------------------" + "\n")
        return

    file_comparison(lexis_file_path, ciprs_file_path, out_file_path)
# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()