forked from derv82/werdy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsort.py
executable file
·1076 lines (859 loc) · 34.8 KB
/
sort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
"""
(C) Derv Merkler 2011
What this script does:
1) reads from 1 or many text files
2) sorts all lines in the file
3) removes duplicates (if any are found)
4) saves the sorted/dupe-killed lines to a new file
// DONE:
* "file kill" option to delete input files after they are read.
results in only 1X extra file space used instead of 2X.
* warning about making disk space available for output file *AND* chunks
ex: sorting 2GB word of wordlists requires 4GB free space.
* "convert case" option to possible option to "save as lowercase/upper/FirstLetter"
this would save items in the list as lowercase rather than saving the item AND saving the lower-case equivalent.
* allow creation of similar words (lcase, UCASE, First)
duplicates will most likely be created.
the disk space used could multiply for each case type used
ex: using lcase and ucase may cause the disk space used to be 3x original file size.
// TODO:
* dragging-dropping files makes CWD = C:\Windows\system32.. which is bad
we don't want to save sort.txt to system32...
* alter 'freq sort' so that when the max chunk size is reached,
it removes all elements of frequency freq_min or less, and increments freq_min += 1
* create copy of elite text
* convert text to 'elite'
* specify that '-w' only removes whitespace characters AT THE BEGINNING/END of the line!!
update help() to show this.
* stats on # of non-printing characters removed.
* add support for mac-formatted EOL files
(probably not gonna happen - .readline() does not work well w/ mac files)
"""
import sys # for command-line arguments
import os # for file-system purposes
import time # for calculating running time
import heapq # for merging text files
import tempfile # for creating temporary directory
import math # calculating logarithms
import platform # for identifying operating system
import ctypes # for grabbing disk free space (Windows)
from glob import glob # for grabbing files from directory via wildcards
# stripping non-printing characters
import unicodedata, re
control_chars = ''.join(map(unichr, range(0,32) + range(127,160)))
control_char_re = re.compile('[%s]' % re.escape(control_chars))
strip_nonprinting = True # default to stripping the characters
# this is slightly slower but definitely useful and likely to be used for password lists
strip_whitespace = True # remove leading/trailing whitespace characters of each line
outfile = 'sort.txt' # file to save to (changable via -o switch)
length_min = 0 # -m
length_max = 0 # -M
remove_dupes = True # -d
# size to split the output file into (output files will not exceed this number of bytes)
split_size = 1024 * 1024 * 1024 * 1 # 1GB
# size to split input files into (use a smaller number for less RAM usage)
chunk_size = 1024 * 1024 * 75 # 75MB
update_every = 25000 # every ___ words to update percentages
left_sep = '' # left separator
right_sep = '' # right separator
delete_string = [] # lines containing these strings will be skipped
# number of words removed and why
words_removed_short = 0
words_removed_long = 0
words_removed_dupe = 0
words_removed_blank = 0
words_removed_del = 0
rjust = 28 # column length for file names. used for right-justification
njust = 13 # column length for word totals.
convert_case = '' # case to convert to, lcase, ucase, first
# make-a-copy flags for different cases
copy_lower = False
copy_upper = False
copy_first = False
copy_elite = False
delete_input = False # delete the input file(s) after data is parsed.
# temporary directory
temp_dir = tempfile.mkdtemp(prefix='sort')
if not temp_dir.endswith(os.sep): temp_dir += os.sep
# frequency variables
freq_flag = False # sort list by how frequently the lines appeared (most frequent first)
freq_show_count = False # display the number of times a word appeared
freq_min = 0 # minimum times a word must appear to be counted as 'frequent'
# input: a word, a case (lcase, upper, first, etc)
# output: the word, in the given case.
# ex: convert("oh hi", 'first') returns "Oh hi"
def convert(word, case):
if case == 'lcase' or case == 'lower':
return word.lower()
elif case == 'ucase' or case == 'upper' or len(word) <= 1 and case == 'first':
return word.upper()
elif case == 'first':
return word[0].upper() + word[1:].lower()
return word
# generator for leet speak text
def leetify(text):
leet=[['a','4','@'],
['b','8'],
['c','('],
['d'],
['e','3'],
['f'],
['g','9'],
['h','#'],
['i','1','!'],
['l','7','1'], # '!',
['m','nn'],
['o','0'], # '()',
['s','5','$'], # ,'z'
['t','7','+'],
['w'], # ,'vv'
['z','2']]
list = [0] * len(text)
count = [0] * len(text)
for i, letter in enumerate(text):
letter = letter.lower()
found = False
for l in leet:
if letter == l[0]:
found = True
list[i] = l
count[i] = min(1, len(list[i]) - 1)
break
if not found:
list[i] = [letter]
count[i] = 0
count[len(text) - 1] = 0
ind = len(count) - 1
while ind >= 0:
count[ind] += 1
if count[ind] >= len(list[ind]):
count[ind] = min(1, len(list[ind]) - 1)
ind -= 1
else:
ind = len(text) - 1
s = ''
for x in xrange(0, len(list)):
s += list[x][count[x]]
yield s
# first half of the sorting process
# input: list of files
# output: list of filenames containing all data from input files,
# each file is sorted in non-decreasing order,
# each file is at most of size chunk_size, and has all invalid lines removed.
def split(files):
global length_min, length_max, split_size, remove_dupes, update_every
global words_removed_short, words_removed_long, words_removed_blank, words_removed_del
global left_sep, right_sep, delete_string, strip_whitespace, convert_case
global copy_lower, copy_upper, copy_first, copy_elite, strip_nonprinting
split_file = 0 # current chunk filename we are creating
bytes_cur = 0 # current amount of bytes in this chunk (not to exceed chunk_size)
lst = [] # contains lines of current chunk
words_total, words_kept, files_loaded= 0, 0, 0 # counters
still_sorted = True
for file in files: # iterate over every file in input
if not os.path.exists(file):
print '\n file not found: "' + file + '"',
sys.stdout.flush()
continue
elif not os.path.isfile(file):
print '\n skipping directory: "' + file + '"',
sys.stdout.flush()
continue
print ''
files_loaded += 1
words_in_file, words_created = 0, 0
file_cur, file_len = 0, os.path.getsize(file) # current/total bytes in this file
last = ''
i = open(file, 'rb')
line = i.readline()
while line:
if words_total % update_every == 0: # only print out every so often so we are no encumbered by std out
print '\r loading%s, words: %s (%.2f%%) ' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust), (100 * float(file_cur) / float(file_len))),
sys.stdout.flush()
words_in_file += 1
file_cur += len(line) # update current location in this file
if strip_nonprinting: # check for non-printing stripping
line = control_char_re.sub('', line)
if strip_whitespace: # check for white-space stripping
line = line.strip()
elif line.endswith('\n') != -1:
line = line.replace('\n', '')
save = True # flag, tells us if line should be saved
# check for left-separator option
if left_sep != '':
if line.rfind(left_sep) == -1:
save = False
else:
line = line[:line.rfind(left_sep)]
# check for right-separator option
if right_sep != '':
if line.find(right_sep) == -1:
save = False
else:
line = line[line.find(right_sep) + len(right_sep):]
# check for delete-string option
for de in delete_string:
if line.find(de) != -1:
save = False
words_removed_del += 1
break
# check for blank-line
if len(line) == 0:
save = False
words_total -= 1
words_removed_blank += 1
# minimum length option
elif length_min > 0 and len(line.replace('\n','')) < length_min:
save = False
words_removed_short += 1
# maximum length option
elif length_max > 0 and len(line.replace('\n','')) > length_max:
save = False
words_removed_long += 1
if save: # if we are still supposed to save it, do it
bytes_cur += len(line) + 1
lst.append(line)
words_kept += 1
if still_sorted and cmp(last, line) > 0:
still_sorted = False
# make copies in diff cases (if needed)
if copy_lower and line.lower() != line:
if still_sorted: still_sorted = False
lst.append(line.lower())
bytes_cur += len(line) + 1
words_created += 1
if copy_upper and line.upper() != line:
if still_sorted: still_sorted = False
lst.append(line.upper())
bytes_cur += len(line) + 1
words_created += 1
if copy_first:
temp = convert(line, 'first')
if temp != line:
if still_sorted: still_sorted = False
lst.append(temp)
bytes_cur += len(temp) + 1
words_created += 1
if copy_elite:
if still_sorted: still_sorted = False
for elite_perm in leetify(line):
if words_created % update_every == 0:
print '\r loading%s, words: %s (%.2f%%) ' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust), (100 * float(file_cur) / float(file_len))),
sys.stdout.flush()
lst.append(elite_perm)
bytes_cur += len(elite_perm) + 1
words_created += 1
if bytes_cur >= chunk_size:
print '\r loading%s, words: %s *sorting*' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust)),
sys.stdout.flush()
lst.sort() # sort the chunk
still_sorted = True
print '\r loading%s, words: %s *chunking*' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust)),
sys.stdout.flush()
o = open(temp_dir + str(split_file), 'wb')
for item in lst: # save the chunk
o.write(item + '\n')
o.close()
del lst[:]
split_file += 1 # increment the current chunk file
bytes_cur = 0
print '\r loading%s, words: %s (%.2f%%) ' % (filename(file).rjust(rjust), format(words_in_file, ',d').rjust(njust), (100 * float(file_cur) / float(file_len))),
sys.stdout.flush()
words_total += 1
# check if we have exceeded the chunk size while iterating over this file
if bytes_cur >= chunk_size:
print '\r loading%s, words: %s *sorting*' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust)),
sys.stdout.flush()
if not still_sorted: # the list is not already sorted
lst.sort() # sort the chunk
still_sorted = True
print '\r loading%s, words: %s *chunking*' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust)),
sys.stdout.flush()
o = open(temp_dir + str(split_file), 'wb')
for item in lst: # save the chunk
o.write(item + '\n')
o.close()
del lst[:]
split_file += 1 # increment the current chunk file
bytes_cur = 0
print '\r loading%s, words: %s ' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust)),
sys.stdout.flush()
last = line
line = i.readline()
i.close()
print '\r loaded %s, words: %s ' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust)),
sys.stdout.flush()
if delete_input:
os.remove(file)
print '\r loaded %s, words: %s *deleted* ' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust)),
sys.stdout.flush()
# at this point, we are done extracting data from the list of input files
if files_loaded == 0:
print '\n no files found! exiting'
sys.exit(0)
# check if there are still items left in the chunk to be saved
if len(lst) > 0:
print '\r loaded %s, words: %s *sorting*' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust)),
sys.stdout.flush()
if not still_sorted: # the list is not already sorted
lst.sort() # sort the chunk
print '\r loaded %s, words: %s *chunking*' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust)),
sys.stdout.flush()
o = open(temp_dir + str(split_file), 'wb')
o.write('\n'.join(lst)) # save the chunk
o.close()
del lst[:]
split_file += 1
print '\r loaded %s, words: %s ' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust)),
sys.stdout.flush()
if delete_input:
#os.remove(file)
print '\r loaded %s, words: %s *deleted* ' % (filename(file).rjust(rjust), format(words_in_file + words_created, ',d').rjust(njust)),
sys.stdout.flush()
print '\n ' + ('%d files' % (files_loaded)).rjust(rjust) + ', total: %s words' % (format(words_kept + words_created, ',d')).rjust(njust)
result = []
for x in xrange(0, split_file):
result.append(temp_dir + str(x))
return result # return list of filenames containing the sorted chunks
# generator for lines in a file. used by the 'merge' function
def file_gen(file):
f = open(file, 'rb')
line = f.readline()
while line:
yield line
line = f.readline()
f.close()
os.remove(file)
# merge sorted chunks into one file
# input: 'chunks': list of filenames; each file is sorted
# 'file' : filename to save list to
# result: all data contained in 'chunks' is saved to 'file', maintaining sorted order
# 'file' size will not exceed split_size (bytes)
def merge(chunks, file):
global update_every, words_removed_dupe
outfile = open(file, 'wb')
saving_to = file # current file we are writing to (subject to change as size exceeds split_size)
bytes_so_far, bytes_cur, count, split_file = 0, 0, 0, 0 # counters for bytes/lines
# total number of bytes for all input files (awesome one-liner!)
bytes_ttl = sum(os.path.getsize(str(chunk)) for chunk in chunks)
words_in_file, words_kept, last = 0, 0, '' # word counters
# frequency counters
freq_count, freq_list = 1, []
# we get to use the beautiful heapq.merge() method.
# it will take our input files and grab the next-smallest word from each file! it's amazing!
for line in heapq.merge(*(file_gen(f) for f in chunks)):
if count % update_every == 0:
if freq_flag:
print '\r counting words: %s (%.3f%%)' % (format(words_in_file, ',d').rjust(njust), 100 * float(bytes_cur) / float(bytes_ttl)),
sys.stdout.flush()
else:
print '\r saving ' + filename(saving_to).rjust(rjust) + ', words: %s (%.3f%%)' % (format(words_in_file, ',d').rjust(njust), 100 * float(bytes_cur) / float(bytes_ttl)),
sys.stdout.flush()
bytes_cur += len(line)
if bytes_so_far + len(line) >= split_size:
outfile.close()
print '\r saved ' + filename(saving_to).rjust(rjust) + ', words: %s (%s) ' % (format(words_in_file, ',d').rjust(njust), inttosize(bytes_so_far))
split_file += 1
c = file.rfind('.')
if c == -1: c = len(file) - 1
saving_to = file[:c] + '-' + str(split_file) + '' + file[c:]
outfile = open(saving_to, 'wb')
words_in_file = 0
bytes_so_far = 0
if not remove_dupes or line != last:
if freq_flag:
if freq_count >= freq_min:
freq_list.append((freq_count, last))
# ensure the sorted list is not too big
if sys.getsizeof(freq_list) > chunk_size:
freq_list.sort(reverse=True) # reverse sort
while sys.getsizeof(freq_list) > chunk_size:
freq_list.pop(len(freq_list) - 1) # remove least-frequently-occuring element.
freq_count = 1
else:
outfile.write(line)
bytes_so_far += len(line)
last = line
words_in_file += 1
words_kept += 1
elif freq_flag:
freq_count += 1
pass
else:
words_removed_dupe += 1
count += 1
# calculating frequency of words, sorting, saving, etc
if freq_flag and len(freq_list) > 0:
print '\r counting words: %s (%.3f%%)' % (format(words_in_file, ',d').rjust(njust), 100 * float(bytes_cur) / float(bytes_ttl)),
print '\n frequency sort: *sorting*...',
sys.stdout.flush()
freq_list.sort(reverse=True)
print '\r frequency sort: *saving* to %s...' % (file),
outfile = open(file, 'wb')
just = len(str(freq_list[0][0]))
if freq_show_count:
outfile.write('#'.rjust(just) + ' word\n')
outfile.write('%s %s\n' % ('-' * just, '-'* len(freq_list[0][1])))
bytes_so_far = 0
for item in freq_list:
if freq_show_count:
bytes_so_far += len(str(item[0]).rjust(just) + ' ' + item[1])
outfile.write(str(item[0]).rjust(just) + ' ' + item[1])
else:
bytes_so_far += len(item[1])
outfile.write(item[1])
outfile.close()
print '\r frequency sort: saved to %s (%s)' % (file, inttosize(bytes_so_far))
elif freq_flag:
print '\n frequency sort: no lines occurred more than %s times' % (freq_min)
else:
print '\r saved ' + filename(saving_to).rjust(rjust) + ', words: %s (%s) ' % (format(words_in_file, ',d').rjust(njust), inttosize(bytes_so_far))
print '\n total words saved: %s' % (format(words_kept, ',d'))
outfile.close()
# extract the filename from a path
def filename(path):
s = path.rfind(os.sep)
if s == -1:
result = path
else:
result = path[s+1:]
if len(result) < rjust:
return result
mid = int(rjust / 2)
return result[:mid-1] + '...' + result[2+len(result)-mid:]
# converts string representation of a filesize to int (bytse)
# ex: sizetoint('1mb') returns 1048576
def sizetoint(size):
snum = ''
for char in size:
if char.isdigit() or char == '.':
snum += char
else:
break
num = float(snum)
chars = size[len(snum):].strip().lower()
chars = chars.replace('b','')
if chars == 't':
num = num * 1024 * 1024 * 1024 * 1024
elif chars == 'g':
num = num * 1024 * 1024 * 1024
elif chars == 'm':
num = num * 1024 * 1024
elif chars == 'k':
num = num * 1024
return int(num)
# converts numeric bytes into human-readable format
def inttosize(bytes):
b = 1024 * 1024 * 1024 * 1024
a = ['t','g','m','k','']
for i in a:
if bytes >= b:
return '%.2f%sb' % (float(bytes) / float(b), i)
b /= 1024
return '0b'
# converts seconds to human-readable format
# ex: sectotime(3661) returns '1h 1m 1.00s'
def sectotime(sec):
result = ''
if sec > 3600:
result = '%dh ' % (sec / 3600)
sec %= 3600
if sec > 60:
result += '%dm ' % (sec / 60)
sec %= 60
result += str('%.2fs' % sec)
return result
# print help screen
def help():
print ' this script sorts [and can clean] wordlists.'
print ''
print ' usage: python sort.py [OPTIONS] [FILE(S)]'
print ''
print ' OPTIONS:'
print " -o $ output file sorted data is written to. default: sort.txt"
print " -m # minimum line length. lines shorter than # will be removed"
print " -M # maximum line length. lines longer than # will be removed"
print ""
print " -d do NOT remove duplicates. default: remove duplicate lines"
print ""
print " -s $ split output file into segments of size $"
print " ex: -s 100mb splits output into 100 megabyte chunks"
print " ex: -s 0.9gb splits output into 0.9 gigabyte chunks"
print " works with prefixes: [b], kb, mb, gb, or tb. default:", inttosize(split_size)
print ""
print " -c # chunk size. this decides how much data is loaded into memory"
print " smaller is safer, but slower. default:", inttosize(chunk_size)
print ""
print " -freq sort the list based on how frequently the lines appear"
print " list will be ordered from most-frequent to least-frequent"
print " -freqn same as -freq, but also display # of times each line appeared"
print " the output will resemble a table instead of a list of words"
print ""
print " -l $ in each line, extract text to the left of $"
print " -r $ in each line, extract text to the right of $"
print " note: if separator is not found, line is ignored (removed)"
print ""
print " -x $ ignore (do not save) line if it contains $. (case sensitive)"
print " note: you can use -x multiple times if desired"
print ""
print " -np DON'T strip non-printing characters from each line"
print " -w DON'T strip leading/trailing whitespace characters from each line"
print ""
print " -kill deletes all input files as they are parsed."
print " this is useful if you want to free disk space during the sort"
print " WARNING: this will PERMANENTLY DELETE ALL FILES you pass to it"
print " this option only requires 1X input size free space instead of 2X"
print ""
print " -convert $ convert lines to $ case: lower, upper, first, elite."
print " ex: to convert list to lower-case, type: -convert lower"
print ""
print " -lower save copies of each line as lower-case"
print " -upper save a copy of each line as UPPER-CASE"
print " -first save a copy of each line as First Letter Capitalized"
print " -elite save a copy of each line as elite (31337)"
print ""
print " example: python sort.py -m 8 -M 63 *.txt"
# return amount of free space (in bytes) available on the drive
# cross-platform!
def get_free_space(folder):
if platform.system() == 'Windows':
free_bytes = ctypes.c_ulonglong(0)
ctypes.windll.kernel32.GetDiskFreeSpaceExW(ctypes.c_wchar_p(folder), None, None, ctypes.pointer(free_bytes))
return free_bytes.value
else:
oss = os.statvfs(folder)
return oss.f_bfree * oss.f_bsize
# return path to this file (sort.py)
def get_script_path():
p = os.path.realpath(__file__)
p = p[:p.rfind(os.sep)+1]
return p
# ensures there is enough space in the temp folder and cwd for the files
# exits with error message if there may not be enough space
def filesize_check(files):
total = sum(os.path.getsize(file) for file in files)
grand_total = total
delete_multiplier = 2
if delete_input:
delete_multiplier = 1
if copy_lower:
grand_total += total
if copy_upper:
grand_total += total
if copy_first:
grand_total += total
cdir = os.getcwd()
if platform.system() == 'Windows':
cdir = cdir[:cdir.find(os.sep)+1]
cwd_free = get_free_space(cdir)
tdir = temp_dir
if platform.system() == 'Windows':
tdir = tdir[:tdir.find(os.sep)+1]
tmp_free = get_free_space(tdir)
not_enough = False
if cwd_free == tmp_free:
# cwd and tmp are the same disk
grand_total * delete_multiplier
if cwd_free < (grand_total):
not_enough = True
print ' * not enough space on disk ' + cdir
print '\n the program may require up to %s free space' % (inttosize(grand_total))
print ' free space available on %s: %s' % (cdir, inttosize(cwd_free))
else:
# cwd and tmp are separate disks
if tmp_free < grand_total:
not_enough = True
print ' * not enough space on disk: ' + tdir
print ' the program may require up to %s free space' % (inttosize(grand_total))
print ' free space available on ' + tdir + ': %s' % (inttosize(tmp_free))
elif not delete_input and cwd_free < grand_total:
not_enough = True
print ' * not enough space on disk: ' + cdir
print '\n the program may require up to %s free space' % (inttosize(grand_total))
print ' free space available on ' + cdir + ': %s' % (inttosize(cwd_free))
elif delete_input and cwd_free < grand_total - total:
not_enough = True
print ' * not enough space on disk: ' + cdir
print ' the program may require up to %s free space' % (inttosize(grand_total - total))
print '\n free space available on ' + cdir + ': %s' % (inttosize(cwd_free))
if not_enough:
sys.exit(0)
# checks if 'file' and permutations of 'file' already exist,
# prompts the user to delete them if they do, otherwise outputs nothing.
def output_check(file):
exists = []
if os.path.exists(file):
exists.append(file)
c = file.rfind('.')
if c == -1: c = len(file) - 1
i = 1
while os.path.exists(file[:c] + '-' + str(i) + file[c:]):
exists.append(file[:c] + '-' + str(i) + file[c:])
i += 1
if len(exists) > 0:
if len(exists) == 1:
print ' the following file already exists and may be overwritten:'
else:
print ' the following files already exist and may be overwritten:'
for i, e in enumerate(exists):
if i < 4 or i > len(exists) - 3:
print ' * ' + e
else:
print '.',
if len(exists) == 1:
print ' do you want to delete this file? (Y/N):',
else:
print ' do you want to delete these files? (Y/N):',
x = raw_input().strip().lower()
if x == '' or x[0] != 'y':
print ''
print ' you can specify the output file with the -o switch'
print ' ex: python sort.py -o new_file.txt *.txt'
sys.exit(0)
for e in exists:
os.remove(e)
# ensure the user wants to delete every input file before generating the output file
def delete_check(files):
if delete_input:
print '\n ************ WARNING ***************'
print ''
if len(files) > 1:
print ' you have chosen to delete input files after they have been read'
else:
print ' you have chosen to delete the input file after it has been read'
print ' this will PERMANENTLY DELETE the following file(s):'
for f in files:
print ' * ' + f
print ' are you sure that you want to do this? (Y/N)',
x = raw_input().lower()
if x != 'y':
print '\n exiting. run without the -kill option to avoid this warning'
sys.exit(0)
print '\n no, seriously, are you sure? (Y/N)',
x = raw_input().lower()
if x != 'y':
print '\n exiting. run without the -kill option to avoid this warning'
sys.exit(0)
# parse command-line arguments, set global variables
def parse_args(args):
global length_min, length_max, remove_dupes
global split_size, chunk_size, outfile
global left_sep, right_sep, delete_string, strip_whitespace
global freq_flag, freq_show_count, delete_input
global convert_case, copy_lower, copy_upper, copy_first, copy_elite
global strip_nonprinting
result, i = [], 0
printed = False
while i < len(args):
x = args[i]
if x == '-m' or x == '-min' or x == '--min':
if i == len(args) - 1:
print ' minimum length required!'
sys.exit(0)
length_min = int(args[i+1])
printed = True
print ' * minimum word length set: ' + str(length_min)
i += 1
elif x == '-M' or x == '-max' or x == '--max':
if i == len(args) - 1:
print ' maximum length required!'
sys.exit(0)
length_max = int(args[i+1])
printed = True
print ' * maximum word length set: ' + str(length_max)
i += 1
elif x == '-o' or x == '--output':
if i == len(args) - 1:
print ' output file name required!'
sys.exit(0)
outfile = args[i+1]
print ' * output file set: "' + outfile + '"'
i += 1
elif x == '-d' or x == '--duplicates':
remove_dupes = False
printed = True
print ' * duplicate removal disabled'
elif x == '-s' or x == '--split':
if i == len(args) - 1:
print ' split size (in bytes) required!'
sys.exit(0)
split_size = sizetoint(args[i+1])
printed = True
print ' * file split size set: ' + format(split_size, ',d') + ' bytes'
i += 1
elif x == '-c' or x == '--chunk':
if i == len(args) - 1:
print ' chunk size (in bytes) required!'
sys.exit(0)
chunk_size = sizetoint(args[i+1])
printed = True
print ' * chunk size set:', format(chunk_size, ',d'), 'bytes'
i += 1
elif x == '-l' or x == '--left':
if i == len(args) - 1:
print ' separating string is required!'
sys.exit(0)
left_sep = args[i+1]
printed = True
print ' * left-separator set:', left_sep
i += 1
elif x == '-r' or x == '--right':
if i == len(args) - 1:
print ' separating string is required!'
sys.exit(0)
right_sep = args[i+1]
printed = True
print ' * right-separator set:', right_sep
i += 1
elif x == '-x' or x == '--skip':
if i == len(args) - 1:
print ' deletion string is required!'
sys.exit(0)
delete_string.append(args[i+1])
printed = True
print ' * deletion string added:', args[i+1]
i += 1
elif x == '-help' or x == '/?' or x == '--help' or x == '-h' or x == '?':
help()
sys.exit(0)
elif x == '-wpa' or x == '--wpa':
length_min = 8
length_max = 63
remove_dupes = True
print ' *** WPA-PSK MODE ***'
print ' * minimum word length set: ' + str(length_min)
print ' * maximum word length set: ' + str(length_max)
print ' * duplicate removal enabled'
printed = True
elif x == '-w' or x == '--no-strip':
strip_whitespace = False
print ' * whitespace stripping disabled'
printed = True
elif x == '-np' or x == '--non-printing':
strip_nonprinting = False
print ' * non-printing character stripping disabled'
printed = True
elif x == '-freq' or x == '--freq':
freq_flag = True
freq_show_count = False
print ' * sorting by frequency enabled'
printed = True
elif x == '-freqn' or x == '--freq-count':
freq_flag = True
freq_show_count = True
print ' * sorting by frequency enabled, line frequency is included in output'
printed = True
elif x == '-kill' or x == '--delete-input-files':
delete_input = True
elif x =='-convert' or x == '--convert':
if i == len(args) - 1:
print ' convert string case required! ex: lower, upper, first'
sys.exit(0)
convert_case = args[i+1]
printed = True
print ' * convert case set: ', convert(args[i+1], args[i+1])
i += 1
elif x == '-lower' or x =='-lcase':
copy_lower = True
print ' * lower-case copies enabled'
printed = True
elif x == '-upper' or x =='-ucase':
copy_upper = True
print ' * upper-case copies enabled'
printed = True
elif x == '-first' or x =='--first':
copy_first = True
print ' * first-letter-upper copies enabled'
printed = True
elif x == '-elite' or x == '--elite':
copy_elite = True
print ' * elite (leetspeak) mutations enabled'
printed = True
else:
if x.find('*') != -1:
fixed = ''
for letter in x:
if letter == '[':
fixed += '[[]'
elif letter == ']':
fixed += '[]]'
else:
fixed += letter
x = fixed
for item in glob(x):
result.append(item)
elif os.path.isfile(x):
result.append(x)
elif os.path.isdir(x):
if not x.endswith(os.sep):
x += os.sep
for item in glob(x + '*'):
result.append(item)
i += 1
if printed: print ''
return result
# where the magic happens!
def start(args):
global outfile, words_removed_short, words_removed_long, words_removed_blank, words_removed_dupe, words_removed_del, delete_string
print ""
print " / derv's wordlist magic /"
print ""
files = parse_args(args)