-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfizzylogo_tokenizer.coffee
375 lines (321 loc) · 13.1 KB
/
fizzylogo_tokenizer.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
tokenizeCommand = (command) ->
# we can just use the dot instead of 's.
# Using 's would make things more complicated
# because we also use ' as quote symbol
command = command.replace /'[ ]*s[\s]/g, "."
# separate parens
command = command.replace /\(/g, " ( "
command = command.replace /\)/g, " ) "
command = command.replace /\[/g, " [ "
command = command.replace /\]/g, " ] "
# also "," which is used in list literals
# (i.e. the "array form" way of initialising a list)
# separates anything before or after
command = command.replace /,/g, " , "
# this is the statement separator
command = command.replace /;/g, " ; "
# @ as shorthand for "self." or "self", à la coffeescript:
# any @ that is just before something that is not a closing delimiter
# is a field access. Note that there must be no space, so:
#
# @print // is a field access
#
# ...is different from:
#
# @ print // not a field access
#
# any other @ (i.e. next to a space or a closing delimeter) is just
# "self" like in the second case.
command = command.replace /@([^\]);\s])/g, " self . $1"
command = command.replace /@/g, " self "
# separate numbers and punctuation from anything else
# note that:
# 1) numbers and punctuation are dealt together because
# otherwise something like 3.14 is separated into 3 . 14
# so we need to handle them both here so we get that floating
# point case before we separate the dots
#
# 2) consecutive operators like ++, --, +=, **
# ^^ are left untouched, so you can give good meaning to them
# for example this:
# a++ +
# becomes
# a ++ +
# so the spacing between the operators can be significant
#
# see this playground with an example:
# https://regex101.com/r/LRqxeN/3
#
command = command.replace /([0-9]*\.[0-9]+([eE][- ]?[0-9]*)?)|([^+\-^*/<>()\[\]=←.!%])([+\-^*/<>()\[\]=←.!%]+)/g, "$1$3 $4 "
# separates the *end* of a sequence of punctuations from what
# comes after. e.g. 3.14 =a becomes 3.14 = a
# not that we can't/don't need to do that with the dot because
# a) it was already done by the regex above and
# b) it would wreck floating point numbers
command = command.replace /([+\-^*/<>()\[\]=←!%])([^+\-^*/<>()\[\]=←!%])/g, "$1 $2"
# things that are now a / _2 become a /_ 2
# this is so we can have an "underscore" version of
# an operator, in this case floor division
command = command.replace /([+\-^*/<>=←!%_])[ ]+_/g, "$1_ "
# separate the minuses from everything else other than
# the decrement case "--". This is so we can make sense
# of things like "1*-2" like most language can
command = command.replace /([+\-^*/<>()\[\]=←!%])-/g, "$1 -"
command = command.replace /- -/g, "--"
command = command.replace /'/g, " ' "
command = command.replace /:/g, " : "
# quote symbol used by Rosetta Smalltalk
# while we use the standard ' i.e. "quote"
#command = command.replace /@/g, " @ "
# collapse all multiple spaces to one
command = command.replace /[ ]+/g, " "
command = command.trim()
removeComments = (code) ->
# this fizzylogo is empty-line sensitive, this regex is custom
# and also properly removes the lines where there are comments
# (i.e. it doesn't just strip the comment leaving empty the line
# where it was)
#
# 1st part: multi-line comments starting with new line
# 2nd part: multi-line comments NOT starting with new line
# 3rd part: single-line comments starting with new line
# 4th part: single-line comments NOT starting with new line
# 5th part: multi-line comment at the end of the text
# 6th part: single-line comment at the end of the text
#
# see: https://regex101.com/r/U5pYX4/1
return code.replace ///
(^)\/\*[\s\S]*?\*\/\n?|
([^\\\n])\/\*[\s\S]*?\*\/|
(^)\/\/.*$\n|
([^\\\n])\/\/.*$|
\n\/\/.*$|
\n\/\*[\s\S]*?\*\
///gm, "$1$2$3$4"
# replace all the strings with
# something similar to their base64 encoding.
removeStrings = (code) ->
codeWithoutStrings = code.replace(/"((?:[^"\\\n]|\\.)*)"/g, (all, quoted) ->
if DEBUG_STRINGIFICATION_CHECKS
stringsTable_TO_CHECK_CONVERTIONS[ValidIDfromString quoted] = quoted
#log "$STRING_TOKEN_" + (ValidIDfromString quoted) + " i.e." + quoted
return "$STRING_TOKEN_" + (ValidIDfromString quoted)
)
return codeWithoutStrings
# replaces strings and regexs keyed by index with an array of strings
# see "removeStrings" function
injectStrings = (code) ->
code = code.replace /\$STRING_TOKEN_([\$a-zA-Z0-9_]+)/g, (all, index) ->
val = StringFromValidID index
if DEBUG_STRINGIFICATION_CHECKS
if val != stringsTable_TO_CHECK_CONVERTIONS[index]
throw "ERROR cannot get back string from ID, got back: " + StringFromValidID index
#log "INJECTING $STRING_TOKEN_" + index
#log "INJECTING i.e. " + val
return val
return code
# transform indentations into brackets
# it tries to be smart, so you can add "innocuous" extra
# indentation.
# Typical innocuous indentations that is useful to add
# is the kind that transforms things like this:
#
# for each word in
# ﹍myList
# do
# ﹍codeToBeRun eval
# something else
#
# into this:
#
# for each word in
# ﹍﹍myList // still "under the umbrella" of for
# ﹍do // still "under the umbrella" of for
# ﹍﹍codeToBeRun eval // still "under the umbrella" of do
# something else // NOT "under the umbrella" of do anymore
# // BUT it's still "under the umbrella" of
# for so it's OK
#
# basically you can add tabs within reason if you preserve
# the logical alignment of things, i.e. if a line is still
# "under the umbrella of" or "aligned to" the same line
# then it's OK.
#
# It works this way:
# 1) for any jump "inside" of more
# than one tab is corrected to a jump of one tab.
# 2) for any jump outside:
# the current line is checked for the fist line above
# that is either aligned with it or to the left of it.
# Since we keep the correct identation for all
# lines above, we pick the "correct" tab
# of the current line to be the tab of the found
# one above.
# 3) once the "correct" tab of this line is found
# as per 1) and 2), we just add and remove
# () based on the difference in the "correct"
# tab numbers.
linearize = (code) ->
code = code.replace /﹍/g, " "
# remove all empty lines,
# because we really don't want
# to have the length of empty lines to
# be meaningful. That would be really cruel,
# since they are invisible,
# so to avoid any such situation we just remove empty
# lines here so that we are forced to give the user
# alternative ways.
code = code.replace /^\s*[\r\n]/gm, ""
sourceByLine = code.split("\n")
startOfPreviousLine = ""
linesWithBlockStart = []
unclosedParens = 0
outputSource = ""
actualLineTabs = []
correctedLineTabs = []
for eachLine in [0...sourceByLine.length]
line = sourceByLine[eachLine]
#log "checking " + line
rx = RegExp("^([ ]*)",'gm')
match = rx.exec line
if eachLine == 0
outputSource += " " + line
actualLineTabs.push 0
correctedLineTabs.push 0
continue
startOfThisLine = match[1]
#log "start of line: >" + startOfThisLine + "<"
# leftOrRightOrAligned is only used to understand if the current
# line if to the left, to the right or aligned. The actual
# correct difference in indentation is calculated later.
leftOrRightOrAligned = startOfThisLine.length - actualLineTabs[actualLineTabs.length - 1]
actualLineTabs.push startOfThisLine.length
if tokenizerDebug
log "linearize startOfThisLine: " + startOfThisLine + " " + startOfThisLine.length + " difference in alignment: " + leftOrRightOrAligned + " content: " + line
if leftOrRightOrAligned == 0
correctedIndentationDifference = 0
# this is the statement separator
outputSource += " ; " + line
else if leftOrRightOrAligned > 0
correctedIndentationDifference = 1
if tokenizerDebug
log "linearize adding a ( "
outputSource += (Array(correctedIndentationDifference+1).join "(") + line
else # leftOrRightOrAligned < 0
for k in [(actualLineTabs.length - 2)..0]
if tokenizerDebug
log " k: " + k + " checking line " + sourceByLine[k] + " for alignment "
if actualLineTabs[k] <= startOfThisLine.length
if tokenizerDebug
log "line " + sourceByLine[k] + " is aligned with me and the corrected tabs for that were: " + correctedLineTabs[k]
correctedIndentationDifference = correctedLineTabs[k] - correctedLineTabs[correctedLineTabs.length - 1]
break
if tokenizerDebug
log "linearize adding " + (-correctedIndentationDifference) + " ) "
# if a line is aligned exactly with a line above,
# then we add a ";" as well. So for example in
#
# if a==5:
# ﹍console print "yes a is 5"
# else:
# ﹍console print "no a is not 5"
# console print ". the end."
#
# the if, the else and the last line are all separated by
# ";"
# However, we make some tokens "eat up" the previous ";" in
# via the "removeStatementSeparatorsBeforeAlignedConstructs"
# function, in this case the "else" eats up the ";" separator
# before it, so the if/else construction can work
# properly.
# This is the best way I found to have the "if/else-if/else"
# construct to behave like an expression, and in the process
# I could also get rid of the fakeCatch and fakeElse
# tricks.
if actualLineTabs[k] == startOfThisLine.length
outputSource += (Array(-correctedIndentationDifference+1).join ")") + " ; " + line
else
outputSource += (Array(-correctedIndentationDifference+1).join ")") + line
unclosedParens += correctedIndentationDifference
correctedLineTabs.push unclosedParens
# final close-off of pending parens
outputSource += (Array(unclosedParens+1).join ")")
#log "code length at identifyBlockStarts: " + code.split("\n").length
if tokenizerDebug
log "linearized program: " + outputSource.replace /^[ ]*/g, ""
return outputSource.replace /^[ ]*/g, ""
# since we separate with ";" all aligned lines, we need
# a way to make the program "flow" again in some cases,
# so here we go. Note that this is not strictly necessary,
# you can always indent in non-aligned way, for example
# in the case of the "do" one could write:
#
# for each word in
# ﹍﹍myList
# ﹍do
# ﹍﹍codeToBeRun eval
# something else
#
# and that needs no fixing by this function. However,
# it's rather unconventional to have that second line
# to have TWO tabs, expecially in some cases such as
# if/else-if/else , where you'd have to write:
#
# if a==5:
# ﹍﹍console print "yes a is 5"
# ﹍else:
# ﹍﹍console print "no a is not 5"
# console print ". the end."
#
# which is somewhat unconventional.
# Hence, we apply this trick here.
removeStatementSeparatorsBeforeAlignedConstructs = (command) ->
for eachToken in ["else","catch"]
command = command.replace (new RegExp("[; ]*(" + eachToken + "[ \n])", "g")), " $1"
command = command.replace /[ ]+/g, " "
command = command.trim()
return command
flTokenize = (command) ->
listsStack = []
listsStack.jsArrayPush FLList.createNew()
# join the multi-line first before we do the strings
command = command.replace /\\\n[ ]*/g, " "
# remove comments first, so we can ignore
# any strings that might be in them
command = removeComments command
command = removeStrings command
if tokenizerDebug
log "codeWithoutStrings: " + command
command = linearize command
if tokenizerDebug
log "linearized command: " + command
# let's normalise the input string so we can
# tokenise it just by looking at the spaces.
if tokenizerDebug
log "command before replacements: " + command
command = tokenizeCommand command
if tokenizerDebug
log "command after replacements: " + command
#log "obtained: command after replacements: " + command
command = removeStatementSeparatorsBeforeAlignedConstructs command
if tokenizerDebug
log "removed statement separators before aligned constructs: " + command
simpleTokenization = command.split(" ")
for eachToken in simpleTokenization
if tokenizerDebug
log "eachToken: " + eachToken
if /\$STRING_TOKEN_([\$a-zA-Z0-9_]+)/g.test(eachToken)
if tokenizerDebug
log eachToken + " is a string literal"
listsStack[listsStack.length-1] = listsStack[listsStack.length-1].flListImmutablePush FLToken.createNew eachToken
else if /^\($/.test(eachToken)
nestedList = FLList.createNew()
listsStack.jsArrayPush nestedList
else if /^\)$/.test(eachToken)
nestedList = listsStack.pop()
listsStack[listsStack.length-1] = listsStack[listsStack.length-1].flListImmutablePush nestedList
else
if tokenizerDebug
log eachToken + " is something else"
listsStack[listsStack.length-1] = listsStack[listsStack.length-1].flListImmutablePush FLToken.createNew eachToken
return listsStack[listsStack.length-1]