Tokenizer.py
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 01 21:32:53 2017
@author: Carson
"""
import re
keywords = {'class' : "CLASS",
            'constructor' : "CONSTRUCTOR",
            'function' : "FUNCTION",
            'method' : "METHOD",
            'field' : "FIELD",
            'static' : "STATIC",
            'var' : "VAR",
            'int' : "INT",
            'char' : "CHAR",
            'boolean' : "BOOLEAN",
            'void' : "VOID",
            'true' : "TRUE",
            'false' : "FALSE",
            'null' : "NULL",
            'this' : "THIS",
            'let' : "LET",
            'do' : "DO",
            'if' : "IF",
            'else' : "ELSE",
            'while' : "WHILE",
            'return' : "RETURN"}

symbols = {'{', '}', '(', ')', '[', ']',
           '.', ',', ';', '+', '-', '*',
           '/', '&', '|', '<', '>', '=',
           '~'}
stringConst = '\".+\"'
identifier = '^[a-zA-Z0-9_]+'
def isValidConstant(num):
    # A valid Jack integer constant is a decimal value in the range 0 .. 32767.
    try:
        num = int(num)
    except ValueError:
        return False
    return -1 < num < 32768
class JackTokenizer():
    def __init__(self, instream):
        self.currToken = ""
        self.curritr = 0
        self.filename = instream.replace("jack", "xml")
        self.ostream = open(self.filename, mode='w')
        self.stream = open(instream, mode='r')
        self.filecont = ["<tokens>\n"]
        self.commentFlag = False
        for line in self.stream:
            # Drop single-line comments.
            line = line.split("//")[0]
            # Handle /** ... */ block comments, which may open and close on the same line.
            if line.find("/**") != -1:
                if line.find("*/") != -1:
                    tempLine = line.split("*/")[-1]
                    line = line.split("/**")[0] + tempLine
                else:
                    line = line.split("/**")[0]
                    self.commentFlag = True
            if line.find("*/") != -1:
                self.commentFlag = False
                line = line.split("*/")[-1]
            if self.commentFlag and (line.find("*") != -1):
                line = ""
            # Keep string constants as single tokens instead of splitting them on whitespace.
            if line.find("\"") != -1:
                tempLine = line
                line = []
                i = tempLine.find("\"", 0)
                j = tempLine.find("\"", i + 1)
                line += tempLine[0:i].split()
                line.append(tempLine[i:(j + 1)])
                line += tempLine[(j + 1):].split()
            else:
                line = line.split()
            for word in line:
                # Split each word on Jack symbols, keeping the symbols themselves as tokens.
                sym_split = r"{|}|\(|\)|\[|\]|\.|,|;|\+|-|\*|/|&|\||<|>|=|~"
                self.filecont += re.split("(" + sym_split + ")", word)
        self.filecont = [word for word in self.filecont if word not in ["", '']]
        for element in self.filecont:
            self.ostream.write(element + "\n")
    def hasMoreTokens(self):
        return self.curritr < len(self.filecont)
    def advance(self):
        if self.hasMoreTokens():
            self.currToken = self.filecont[self.curritr]
            self.curritr += 1
            return self.evalToken()
        return
    def tokenType(self):
        token = self.currToken
        if token in keywords:
            return "KEYWORD"
        if token in symbols:
            return "SYMBOL"
        if isValidConstant(token):
            return "INT_CONST"
        if re.findall(identifier, token):
            return "IDENTIFIER"
        if re.findall(stringConst, token):
            return "STRING_CONST"
    def evalToken(self):
        token = self.currToken
        if self.tokenType() == "KEYWORD":
            return "<keyword> " + self.currToken + " </keyword>\n"
        if self.tokenType() == "SYMBOL":
            # Escape characters that are special in XML output.
            token = token.replace("&", "&amp;")
            token = token.replace("\"", "&quot;")
            token = token.replace(">", "&gt;")
            token = token.replace("<", "&lt;")
            return "<symbol> " + token + " </symbol>\n"
        if self.tokenType() == "IDENTIFIER":
            return "<identifier> " + token + " </identifier>\n"
        if self.tokenType() == "INT_CONST":
            return "<integerConstant> " + token + " </integerConstant>\n"
        if self.tokenType() == "STRING_CONST":
            token = token.replace("\"", "")
            return "<stringConstant> " + token + " </stringConstant>\n"
        print("error! TokenType not properly handled. token: " + token)