-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathechelogParser.py
68 lines (55 loc) · 3.13 KB
/
echelogParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Utility to Parse HTML IRC logs from echelog.com into plaintext
# Copyright (C) 2021 J. Madgwick
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from bs4 import BeautifulSoup, SoupStrainer
import os,locale,argparse
from datetime import datetime
from alive_progress import alive_bar
parser = argparse.ArgumentParser(description='Simple utility to parse a directory of Echelog HTML IRC chat log files, extract the IRC chat logs and store them in a single file using a standard format. By default, ISO format is used for the timestamp (eg. "2011-02-13T01:55:34").')
parser.add_argument('directory', help='Path to directory containing Echelog HTML files')
parser.add_argument('-l', '--local', action='store_true', help='Use local locale based timestamp instead of ISO format')
parser.add_argument('-a', '--all', action='store_true', help='Include all IRC messages. By default, only chat message are included and not join/part etc. Note that Echelog used different formats for these over the years')
parser.add_argument('output', type=argparse.FileType('w'), help='File to write output into')
args = parser.parse_args()
if args.local:
#Set locale to local locale
locale.setlocale(locale.LC_ALL, '')
dateTimeFormat = '%c'
else:
dateTimeFormat = '%Y-%m-%dT%H:%M:%S'
files = os.listdir(args.directory)
#Filter out non html files and remove extension to allow easy sorting
pages = [fn[:-5] for fn in files if fn[-5:] == '.html']
#Sort logs into date order
pages.sort()
#Filter to parse only the <pre> tag, not the rest of the document - this improves performance
preFilter = SoupStrainer("pre")
with alive_bar(len(pages), title='Processing HTML files', spinner='classic') as bar:
for webpage in pages:
#Open file and parse HTML, use lxml to further improve performance
with open(args.directory + '/' + webpage + '.html') as doc:
soup = BeautifulSoup(doc, 'lxml', parse_only=preFilter)
bar()
bar.text('[File: '+ webpage + ']')
ircText = soup#.body.pre
ircLines = []
for line in ircText.find_all('div'):
time = str(line.contents[0])[1:9]
msgType = line.contents[1]['class'][0]
message = line.contents[1].get_text()
#Skip blank messages
if message != '':
ircLines.append([time,msgType,message])
for line in ircLines:
if line[1] == 'd' or args.all:
args.output.write(datetime.strptime(webpage[:10] + line[0],'%Y-%m-%d%H:%M:%S').strftime(dateTimeFormat) + ' ' + line[2] + '\n')
args.output.close()