# fileIterator.py
import zipfile
import shutil
import os
import sys
import xml.etree.ElementTree as ElementTree
import tkinter as tk
from tkinter import filedialog
# Debug switch read by main(): when True, main() skips the search workflow
# and only prints the number of command-line arguments.
debug = False
def build_window():
    """Show the keyword-entry window and collect the search keywords.

    One text field is created per keyword; the number of fields is read from
    ``sys.argv[1]``. Pressing Return in a field records its contents and
    greys the field out. The window closes itself once every field has been
    submitted; this call blocks until the window is gone.

    Returns:
        The list of submitted keywords, or ``""`` when the window was closed
        before anything was submitted.

    Raises:
        ValueError: if ``sys.argv[1]`` is not an integer.
    """
    keyword = []
    field_count = int(sys.argv[1])  # parsed once and reused by the callback

    def get_input(event):
        """Record the text of the field that received the Return key."""
        field = event.widget
        # Disabling a field does not move keyboard focus away from it, so a
        # second Return could otherwise record the same keyword again and
        # count it toward the total.
        if str(field.cget("state")) == "disabled":
            return
        text = field.get()
        if not text.strip():
            # Nothing typed yet: leave the field editable instead of
            # recording a blank keyword.
            return
        keyword.append(text)
        field.config(state="disabled")
        if len(keyword) == field_count:
            window.destroy()

    window = tk.Tk()
    window.title("File Parser for Security Policies")
    window.geometry("1000x1000")

    for _ in range(field_count):
        entry = tk.Entry(window, width=40)
        entry.pack()
        entry.bind("<Return>", get_input)

    # Static instructions shown below the entry fields, one label per line.
    instructions = (
        "Quick .docx parser",
        "1) Enter a keyword you wish to search for and press the ENTER key and the box will become greyed out",
        "2) Repeat for all fields",
        "3) A file dialog window will appear, select the files you wish to search. To select multiple files, click and drag the highlight box",
        "4) Results of a search for each individual file will be outputted to the terminal",
    )
    for line in instructions:
        tk.Label(window, text=line).pack()

    window.mainloop()  # returns once the window has been destroyed

    if not keyword:
        return ""
    return keyword
def clean(working_directory):
    """Delete the temporary ``text_docs`` directory under *working_directory*.

    A failure to remove the directory is reported on stdout and otherwise
    ignored, so cleanup never aborts the caller.

    Args:
        working_directory: directory that contains ``text_docs``.
    """
    target = os.path.join(working_directory, 'text_docs')
    try:
        shutil.rmtree(target)
    except FileNotFoundError:
        print("Build failed: No text_docs directory to remove")
    except OSError as err:
        # The directory exists but could not be removed (permissions, file
        # in use, ...); say so instead of claiming it is missing.
        print("Build failed: could not remove " + target + ": " + str(err))
def unzip_docx(file_path, working_directory):
    """Copy the text XML of one .docx file into the ``text_docs`` directory.

    The archive member ``word/document.xml`` is written to
    ``<working_directory>/text_docs/<stem>.xml``, where ``<stem>`` is the
    file's base name cut at its first dot.

    If *file_path* cannot be read as a .docx archive, a message is printed,
    ``text_docs`` is removed through ``clean`` and the program exits.

    Args:
        file_path: path of the .docx file to read.
        working_directory: directory that holds (or will hold) ``text_docs``.

    Raises:
        SystemExit: when *file_path* is not a readable .docx archive.
    """
    base_name = os.path.basename(file_path)
    # Cut at the first dot of the base name, not of the whole path, so a
    # dotted directory name cannot corrupt the result. search_for_text
    # rebuilds this name with the same str.find rule; keep the two in step.
    stem = base_name[:base_name.find('.')]
    text_docs = os.path.join(working_directory, 'text_docs')

    try:
        # Read the member straight from the archive: nothing is extracted
        # into a temporary "word" directory, so an unrelated "word" folder
        # next to the documents is never overwritten or removed.
        with zipfile.ZipFile(file_path, 'r') as docx:
            document_xml = docx.read('word/document.xml')
    except (OSError, zipfile.BadZipFile, KeyError):
        # KeyError: a valid zip archive without word/document.xml.
        print(file_path + " is not a .docx file")
        clean(working_directory)
        sys.exit(1)

    os.makedirs(text_docs, exist_ok=True)
    with open(os.path.join(text_docs, stem + '.xml'), 'wb') as out:
        out.write(document_xml)
def get_xml_root(file_path):
    """Parse the XML file at *file_path* and return the root element."""
    return ElementTree.parse(file_path).getroot()
def get_text(xml_tree_root):
    """Return the text of every ``w:t`` element below *xml_tree_root*.

    Args:
        xml_tree_root: root element of a parsed ``word/document.xml``.

    Returns:
        A list of strings in document order, one per ``w:t`` element. An
        element without text contributes ``""`` rather than ``None`` so
        callers can apply string operations to every entry.
    """
    nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    return [node.text or "" for node in xml_tree_root.findall('.//w:t', nsmap)]
def initParser(home_path):
    """Create the ``text_docs`` working directory inside *home_path*.

    An already existing directory is reused silently. Any other failure is
    reported on stdout instead of being swallowed; no exception is raised.

    Args:
        home_path: directory in which ``text_docs`` is created.
    """
    target = home_path + '/' + 'text_docs'
    try:
        os.mkdir(target)
    except FileExistsError:
        # Already present, e.g. left over from an earlier run.
        return
    except OSError as err:
        print("Could not create " + target + ": " + str(err))
def pull_xml(files, working_directory):
    """Call unzip_docx for every path in *files*, in order."""
    for docx_path in files:
        unzip_docx(docx_path, working_directory)
def search_for_text(key, files, working_directory):
    """Search the extracted XML of each file for the keywords and print hits.

    For every name in *files* the matching
    ``<working_directory>/text_docs/<stem>.xml`` is parsed. When every
    keyword occurs somewhere in the file, each distinct text run containing
    at least one keyword is printed as a numbered instance; otherwise
    ``Not found`` is printed.

    Args:
        key: keywords to look for. Blank keywords are ignored and duplicates
            count once.
        files: base names of the searched files.
        working_directory: directory that contains ``text_docs``.
    """
    wanted = {word for word in key if word}
    for name in files:
        # Same first-dot rule that unzip_docx uses to name the XML file.
        stem = name[:name.find('.')]
        xml_path = working_directory + '/text_docs/' + stem + '.xml'
        runs = get_text(get_xml_root(xml_path))
        print("Searching " + stem + '\n')
        found, instances = _collect_matches(wanted, runs)
        if wanted and found == wanted:
            for number, instance in enumerate(instances, start=1):
                print("instance " + str(number) + ":\n" + instance + "\n")
        else:
            print("Not found\n")


def _collect_matches(keywords, runs):
    """Return (keywords found, distinct runs containing any keyword)."""
    found = set()
    instances = []
    seen = set()
    for run in runs:
        if not run:
            continue  # a text element without content cannot match
        # "in" also matches a keyword at the very start of the run.
        hits = [word for word in keywords if word in run]
        if not hits:
            continue
        found.update(hits)
        if run not in seen:
            seen.add(run)
            instances.append(run)
    return found, instances
def main():
    """Run the keyword search: collect keywords, pick files, print results.

    Steps: validate the keyword count given as ``sys.argv[1]``, collect the
    keywords through ``build_window``, let the user choose files in a file
    dialog, extract their XML into ``text_docs`` next to the first chosen
    file, search it, and remove ``text_docs`` again.

    When the module-level ``debug`` flag is true, only the number of
    command-line arguments is printed.

    Raises:
        SystemExit: on a missing or invalid keyword count, when no keyword
            was entered, or when no file was selected.
    """
    if debug:
        print(len(sys.argv))
        return

    # The count must be a positive integer; build_window converts it with
    # int() and would otherwise fail with a traceback.
    try:
        requested = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    except ValueError:
        requested = 0
    if requested < 1:
        print("ERROR: Please give number of keywords to search for as a command line argument")
        sys.exit(1)

    keyword = build_window()
    if not keyword:  # build_window returns "" when nothing was entered
        print("ERROR: Please enter a valid keyword to search files for")
        sys.exit(1)

    files = filedialog.askopenfilenames()
    if not files:  # dialog cancelled: there is no files[0] to work from
        print("ERROR: Please select at least one file to search")
        sys.exit(1)

    file_names = [os.path.basename(path) for path in files]
    # text_docs is created next to the first selected file.
    text_docs_home_directory = os.path.dirname(files[0])

    initParser(text_docs_home_directory)
    try:
        pull_xml(files, text_docs_home_directory)
        search_for_text(keyword, file_names, text_docs_home_directory)
    except Exception:
        # Do not leave text_docs behind when extraction or search fails.
        # SystemExit is deliberately not caught here: unzip_docx cleans up
        # itself before exiting.
        clean(text_docs_home_directory)
        raise
    clean(text_docs_home_directory)
# Run the tool only when executed as a script, so importing this module
# (for reuse or testing) does not open the GUI.
if __name__ == "__main__":
    main()