-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathregexp_nfa.go
302 lines (272 loc) · 9.21 KB
/
regexp_nfa.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
package quamina
import (
"errors"
"fmt"
"sort"
)
// RunePair and related types exported to facilitate building Unicode tables in code_gen
type RunePair struct {
Lo, Hi rune
}
type RuneRange []RunePair
type runeRangeIterator struct {
pairs RuneRange
whichPair int
inPair rune
}
func newRuneRangeIterator(rr RuneRange) (*runeRangeIterator, error) {
if len(rr) == 0 {
return nil, errors.New("empty range")
}
sort.Slice(rr, func(i, j int) bool { return rr[i].Lo < rr[j].Lo })
return &runeRangeIterator{pairs: rr, whichPair: 0, inPair: rr[0].Lo}, nil
}
// In the regular expressions represented by the I-Regexp syntax, the | connector has the lowest
// precedence, so at the top level, it's a slice of what the ABNF calls branches - generate an NFA
// for each branch and then take their union.
// Inside a branch, the structure is obviously recursive because of the ()-group, which itself can
// carry a slice of branches etc. Aside from that, the branch contains a sequence of atom/quantifier
// pairs. All the "atom" syntax options describe ranges of characters and are well-represented by
// the RuneRange type. This includes [] and \[pP]{whatever}.
// All the forms of quantifiers can be described by pairs of numbers. ? is [0,1]. + is [1,♾️]. * is [0,♾️].
// {m,n} ranges also, obviously.
type quantifiedAtom struct {
isDot bool
runes RuneRange
quantMin int
quantMax int
subtree regexpRoot // if non-nil, ()-enclosed subtree here
}
type regexpBranch []*quantifiedAtom
type regexpRoot []regexpBranch
// makeRegexpNFA traverses the parsed regexp tree and generates a finite automaton
// that matches it. If forField is true, then the FA will have states that match " at the beginning
// and end.
func makeRegexpNFA(root regexpRoot, forField bool) (*smallTable, *fieldMatcher) {
nextField := newFieldMatcher()
nextStep := makeNFATrailer(nextField)
if forField {
table := makeSmallTable(nil, []byte{'"'}, []*faNext{nextStep})
state := &faState{table: table}
nextStep = &faNext{states: []*faState{state}}
}
return makeNFAFromBranches(root, nextStep, forField), nextField
}
func makeNFAFromBranches(root regexpRoot, nextStep *faNext, forField bool) *smallTable {
// completely empty regexp
if len(root) == 0 {
return makeSmallTable(nil, []byte{'"'}, []*faNext{nextStep})
}
fa := newSmallTable()
for _, branch := range root {
var nextBranch *smallTable
if len(branch) == 0 {
nextBranch = makeSmallTable(nil, []byte{'"'}, []*faNext{nextStep})
} else {
nextBranch = makeOneRegexpBranchFA(branch, nextStep, forField)
}
fa = mergeFAs(fa, nextBranch, sharedNullPrinter)
}
return fa
}
// makeOneRegexpBranchFA - We know what the last step looks like, so we proceed back to
// front through the members of the branch, which are quantified atoms. Each can be a runeRange (which
// can be a single character or a dot or a subtree, in each case followed by a quantifier.
// We know the last step, which points at the nextField argument.
// There's a problem here. Quamina's match* methods feed string values including the enclosing "" to
// the automaton. This is useful for a variety of reasons. But that means if the regexp is a|b, because
// the | has the lowest precedence, it'd build an automaton that would match ("a)|(b"). So we need to
// just build the automaton on a|b and then manually fasten "-transitions in front of and behind it.
func makeOneRegexpBranchFA(branch regexpBranch, nextStep *faNext, forField bool) *smallTable {
var step *faNext
var table *smallTable
// TODO: Assuming this works, rewrite a bunch of other make*NFA calls in this style, without recursion
for index := len(branch) - 1; index >= 0; index-- {
qa := branch[index]
if qa.isDot {
table = makeDotFA(nextStep)
step = &faNext{states: []*faState{{table: table}}}
} else if qa.subtree != nil {
table = makeNFAFromBranches(qa.subtree, nextStep, false)
step = &faNext{states: []*faState{{table: table}}}
} else {
// it's a rune range
if qa.quantMin != 1 || qa.quantMax != 1 {
panic("Not supported: quantifiers")
}
// just match a rune
table = makeRuneRangeNFA(qa.runes, nextStep, sharedNullPrinter)
step = &faNext{states: []*faState{{table: table}}}
}
nextStep = step
}
if forField {
firstState := &faState{table: table}
firstStep := &faNext{states: []*faState{firstState}}
table = makeSmallTable(nil, []byte{'"'}, []*faNext{firstStep})
}
return table
}
// makeNFATrailer generates the last two steps in every NFA, because all field values end with the
// valueTerminator marker, so you need the field-matched state and you need another state that branches
// to it based on valueTerminator
// TODO: Prove that this is useful in other make*NFA scenarios
func makeNFATrailer(nextField *fieldMatcher) *faNext {
matchState := &faState{
table: newSmallTable(),
fieldTransitions: []*fieldMatcher{nextField},
}
matchStep := &faNext{[]*faState{matchState}}
table := makeSmallTable(nil, []byte{valueTerminator}, []*faNext{matchStep})
return &faNext{states: []*faState{{table: table}}}
}
// plan B
type runeTreeEntry struct {
next *faNext
child runeTreeNode
}
type runeTreeNode []*runeTreeEntry
func addRuneTreeEntry(root runeTreeNode, r rune, dest *faNext) {
// this works because no UTF-8 representation of a code point can be a prefix of any other
node := root
bytes, err := runeToUTF8(r)
// Invalid bytes should be caught at another level, but if they show up here, silently ignore
if err != nil {
return
}
// find or make entry
for i, b := range bytes {
if node[b] != nil {
node = node[b].child
continue
}
// need to make a new node
entry := &runeTreeEntry{}
node[b] = entry
if i == len(bytes)-1 {
entry.next = dest
} else {
entry.child = make([]*runeTreeEntry, byteCeiling)
}
node = entry.child
}
}
func nfaFromRuneTree(root runeTreeNode, pp printer) *smallTable {
return tableFromRuneTreeNode(root, pp)
}
func tableFromRuneTreeNode(node runeTreeNode, pp printer) *smallTable {
var unpacked unpackedTable
for b, entry := range node {
if entry == nil {
continue
}
if entry.next != nil {
unpacked[b] = entry.next
} else {
table := tableFromRuneTreeNode(entry.child, pp)
pp.labelTable(table, fmt.Sprintf("on %x", b))
state := &faState{table: table}
unpacked[b] = &faNext{states: []*faState{state}}
}
}
st := newSmallTable()
st.pack(&unpacked)
return st
}
func makeRuneRangeNFA(rr RuneRange, next *faNext, pp printer) *smallTable {
pp.labelTable(next.states[0].table, "Next")
// turn the slice of hi/lo inclusive endpoints into a slice of utf8 encodings
ri, err := newRuneRangeIterator(rr)
// can't happen I think
if err != nil {
panic("Invalid rune range")
}
var root runeTreeNode = make([]*runeTreeEntry, byteCeiling)
// for each rune
for r := ri.next(); r != -1; r = ri.next() {
addRuneTreeEntry(root, r, next)
}
return nfaFromRuneTree(root, pp)
}
func makeDotFA(dest *faNext) *smallTable {
sLast := &smallTable{
ceilings: []byte{0x80, 0xc0, byte(byteCeiling)},
steps: []*faNext{nil, dest, nil},
}
targetLast := &faNext{states: []*faState{{table: sLast}}}
sLastInter := &smallTable{
ceilings: []byte{0x80, 0xc0, byte(byteCeiling)},
steps: []*faNext{nil, targetLast, nil},
}
targetLastInter := &faNext{states: []*faState{{table: sLastInter}}}
sFirstInter := &smallTable{
ceilings: []byte{0x80, 0xc0, byte(byteCeiling)},
steps: []*faNext{nil, targetLastInter, nil},
}
targetFirstInter := &faNext{states: []*faState{{table: sFirstInter}}}
sE0 := &smallTable{
ceilings: []byte{0xa0, 0xc0, byte(byteCeiling)},
steps: []*faNext{nil, targetLast, nil},
}
targetE0 := &faNext{states: []*faState{{table: sE0}}}
sED := &smallTable{
ceilings: []byte{0x80, 0xA0, byte(byteCeiling)},
steps: []*faNext{nil, targetLast, nil},
}
targetED := &faNext{states: []*faState{{table: sED}}}
sF0 := &smallTable{
ceilings: []byte{0x90, 0xC0, byte(byteCeiling)},
steps: []*faNext{nil, targetLastInter, nil},
}
targetF0 := &faNext{states: []*faState{{table: sF0}}}
sF4 := &smallTable{
ceilings: []byte{0x80, 0x90, byte(byteCeiling)},
steps: []*faNext{nil, targetLastInter, nil},
}
targetF4 := &faNext{states: []*faState{{table: sF4}}}
// for reference, see https://www.tbray.org/ongoing/When/202x/2024/12/29/Matching-Dot-Redux
return &smallTable{
ceilings: []byte{
0x80, // 0
0xC2, // 1
0xE0, // 2
0xE1, // 3
0xED, // 4
0xEE, // 5
0xF0, // 6
0xF1, // 7
0xF4, // 8
0xF5, // 9
byte(byteCeiling), // 10
},
steps: []*faNext{
dest, // 0
nil, // 1
targetLast, // 2
targetE0, // 3
targetLastInter, // 4
targetED, // 5
targetLastInter, // 6
targetF0, // 7
targetFirstInter, // 8
targetF4, // 9
nil, // 10
},
}
}
func (i *runeRangeIterator) next() rune {
if i.inPair <= i.pairs[i.whichPair].Hi {
r := i.inPair
i.inPair++
return r
}
// will blow up on empty pair, could put a check in, or just don't generate them
// while parsing regexp
i.whichPair++
if i.whichPair == len(i.pairs) {
return -1
}
r := i.pairs[i.whichPair].Lo
i.inPair = r + 1
return r
}