diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 808a1e498..c19996d44 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -391,9 +391,8 @@ extension DSLTree.CustomCharacterClass.Member { return { input, bounds in let curIdx = bounds.lowerBound - let nextIndex = isCharacterSemantic - ? input.index(after: curIdx) - : input.unicodeScalars.index(after: curIdx) + let nextIndex = input.index( + after: curIdx, isScalarSemantics: !isCharacterSemantic) // Under grapheme semantics, we compare based on single NFC scalars. If // such a character is not single scalar under NFC, the match fails. In @@ -603,9 +602,9 @@ extension AST.Atom.CharacterProperty { if p(input, bounds) != nil { return nil } // TODO: bounds check - return opts.semanticLevel == .graphemeCluster - ? input.index(after: bounds.lowerBound) - : input.unicodeScalars.index(after: bounds.lowerBound) + return input.index( + after: bounds.lowerBound, + isScalarSemantics: opts.semanticLevel == .unicodeScalar) } } diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 0dafd6720..33b13178b 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -223,6 +223,25 @@ extension String { else { return nil } return next } + + internal func matchRegexDot( + at currentPosition: Index, + limitedBy end: Index, + anyMatchesNewline: Bool, + isScalarSemantics: Bool + ) -> Index? { + guard currentPosition < end else { return nil } + + if anyMatchesNewline { + return index( + after: currentPosition, isScalarSemantics: isScalarSemantics) + } + + return matchAnyNonNewline( + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics) + } } // MARK: - Built-in character class matching diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index a0480cde6..09702f7b4 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,26 +1,140 @@ +private typealias ASCIIBitset = DSLTree.CustomCharacterClass.AsciiBitset + extension Processor { - func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { + private func maybeASCIIBitset( + _ payload: QuantifyPayload + ) -> ASCIIBitset? { + guard payload.type == .asciiBitset else { return nil } + return registers[payload.bitset] + } + + internal mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { + let asciiBitset = maybeASCIIBitset(payload) + + // TODO: Refactor below called functions to be non-mutating. + // They might need to communicate save-point info upwards in addition to + // a new (optional) currentPosition. Then, we can assert in testing that the + // specialized functions produce the same answer as `runGeneralQuantify`. + switch (payload.quantKind, payload.minTrips, payload.maxExtraTrips) { + case (.reluctant, _, _): + assertionFailure(".reluctant is not supported by .quantify") + // TODO: this was pre-refactoring behavior, should we fatal error + // instead? + return false + case (.eager, 0, nil): + let (next, savePointRange) = input.runEagerZeroOrMoreQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end) + assert((next, savePointRange) == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)!) + if let savePointRange { + savePoints.append(makeQuantifiedSavePoint( + savePointRange, isScalarSemantics: payload.isScalarSemantics)) + } + currentPosition = next + return true + case (.eager, 1, nil): + guard let (next, savePointRange) = input.runEagerOneOrMoreQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + assert(nil == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)) + signalFailure() + return false + } + assert((next, savePointRange) == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)!) + if let savePointRange { + savePoints.append(makeQuantifiedSavePoint( + savePointRange, isScalarSemantics: payload.isScalarSemantics)) + } + currentPosition = next + return true + case (_, 0, 1): + // FIXME: Is this correct for lazy zero-or-one? + let (next, save) = input.runZeroOrOneQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end) + // Also, we should assert same answer as runGeneralQuantify... + if save { + savePoints.append(makeSavePoint(resumingAt: currentPC+1)) + } + currentPosition = next + return true + default: + guard let (next, savePointRange) = input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + assert(nil == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)) + signalFailure() + return false + } + assert((next, savePointRange) == input.runGeneralQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end)!) + if let savePointRange { + savePoints.append(makeQuantifiedSavePoint( + savePointRange, isScalarSemantics: payload.isScalarSemantics)) + } + currentPosition = next + + return true + } + } +} + +extension String { + fileprivate func doQuantifyMatch( + _ payload: QuantifyPayload, + asciiBitset: ASCIIBitset?, // Necessary ugliness... + at currentPosition: Index, + limitedBy end: Index + ) -> Index? { let isScalarSemantics = payload.isScalarSemantics switch payload.type { case .asciiBitset: - return input.matchASCIIBitset( - registers[payload.bitset], + assert(asciiBitset != nil, "Invariant: needs to be passed in") + return matchASCIIBitset( + asciiBitset!, at: currentPosition, limitedBy: end, isScalarSemantics: isScalarSemantics) case .asciiChar: - return input.matchScalar( + return matchScalar( UnicodeScalar.init(_value: UInt32(payload.asciiChar)), at: currentPosition, limitedBy: end, boundaryCheck: !isScalarSemantics, isCaseInsensitive: false) case .builtin: - guard currentPosition < end else { return nil } - // We only emit .quantify if it consumes a single character - return input.matchBuiltinCC( + return matchBuiltinCC( payload.builtin, at: currentPosition, limitedBy: end, @@ -28,18 +142,10 @@ extension Processor { isStrictASCII: payload.builtinIsStrict, isScalarSemantics: isScalarSemantics) case .any: - guard currentPosition < end else { return nil } - - if payload.anyMatchesNewline { - if isScalarSemantics { - return input.unicodeScalars.index(after: currentPosition) - } - return input.index(after: currentPosition) - } - - return input.matchAnyNonNewline( + return matchRegexDot( at: currentPosition, limitedBy: end, + anyMatchesNewline: payload.anyMatchesNewline, isScalarSemantics: isScalarSemantics) } } @@ -47,16 +153,29 @@ extension Processor { /// Generic quantify instruction interpreter /// - Handles .eager and .posessive /// - Handles arbitrary minTrips and maxExtraTrips - mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { + fileprivate func runGeneralQuantify( + _ payload: QuantifyPayload, + asciiBitset: ASCIIBitset?, + at currentPosition: Index, + limitedBy end: Index + ) -> ( + nextPosition: Index, + savePointRange: Range? + )? { assert(payload.quantKind != .reluctant) var trips = 0 var maxExtraTrips = payload.maxExtraTrips + var currentPosition = currentPosition while trips < payload.minTrips { - guard let next = _doQuantifyMatch(payload) else { - signalFailure() - return false + guard let next = doQuantifyMatch( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + return nil } currentPosition = next trips += 1 @@ -64,11 +183,16 @@ extension Processor { if maxExtraTrips == 0 { // We're done - return true + return (currentPosition, nil) } - guard let next = _doQuantifyMatch(payload) else { - return true + guard let next = doQuantifyMatch( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + return (currentPosition, nil) } maxExtraTrips = maxExtraTrips.map { $0 - 1 } @@ -81,7 +205,12 @@ extension Processor { while true { if maxExtraTrips == 0 { break } - guard let next = _doQuantifyMatch(payload) else { + guard let next = doQuantifyMatch( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { break } maxExtraTrips = maxExtraTrips.map({$0 - 1}) @@ -90,77 +219,185 @@ extension Processor { } if payload.quantKind == .eager { - savePoints.append(makeQuantifiedSavePoint( - rangeStart.. (Index, savePointRange: Range?) { assert(payload.quantKind == .eager && payload.minTrips == 0 && payload.maxExtraTrips == nil) - _doRunEagerZeroOrMoreQuantify(payload) + return doRunEagerZeroOrMoreQuantify( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end) } - // NOTE: So-as to inline into one-or-more call, which makes a significant - // performance difference + // NOTE: inline-always so-as to inline into one-or-more call, which makes a + // significant performance difference @inline(__always) - mutating func _doRunEagerZeroOrMoreQuantify(_ payload: QuantifyPayload) { - guard let next = _doQuantifyMatch(payload) else { - // Consumed no input, no point saved - return - } - + private func doRunEagerZeroOrMoreQuantify( + _ payload: QuantifyPayload, + asciiBitset: ASCIIBitset?, // Necessary ugliness... + at currentPosition: Index, + limitedBy end: Index + ) -> (Index, savePointRange: Range?) { // Create a quantified save point for every part of the input matched up // to the final position. + var currentPosition = currentPosition + let isScalarSemantics = payload.isScalarSemantics let rangeStart = currentPosition var rangeEnd = currentPosition - currentPosition = next - while true { - guard let next = _doQuantifyMatch(payload) else { break } - rangeEnd = currentPosition - currentPosition = next + var matchedOnce = false + + switch payload.type { + case .asciiBitset: + while true { + assert(asciiBitset != nil, "Invariant: needs to be passed in") + guard let next = matchASCIIBitset( + asciiBitset!, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics) + else { + break + } + matchedOnce = true + rangeEnd = currentPosition + currentPosition = next + assert(currentPosition > rangeEnd) + } + case .asciiChar: + let asciiScalar = UnicodeScalar.init(_value: UInt32(payload.asciiChar)) + while true { + guard let next = matchScalar( + asciiScalar, + at: currentPosition, + limitedBy: end, + boundaryCheck: !isScalarSemantics, + isCaseInsensitive: false) + else { + break + } + matchedOnce = true + rangeEnd = currentPosition + currentPosition = next + assert(currentPosition > rangeEnd) + } + case .builtin: + let builtin = payload.builtin + let isInverted = payload.builtinIsInverted + let isStrictASCII = payload.builtinIsStrict + while true { + guard let next = matchBuiltinCC( + builtin, + at: currentPosition, + limitedBy: end, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + else { + break + } + matchedOnce = true + rangeEnd = currentPosition + currentPosition = next + assert(currentPosition > rangeEnd) + } + case .any: + let anyMatchesNewline = payload.anyMatchesNewline + while true { + guard let next = matchRegexDot( + at: currentPosition, + limitedBy: end, + anyMatchesNewline: anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + else { + break + } + matchedOnce = true + rangeEnd = currentPosition + currentPosition = next + assert(currentPosition > rangeEnd) + } } - savePoints.append(makeQuantifiedSavePoint(rangeStart.. Bool { + fileprivate func runEagerOneOrMoreQuantify( + _ payload: QuantifyPayload, + asciiBitset: ASCIIBitset?, // Necessary ugliness... + at currentPosition: Index, + limitedBy end: Index + ) -> (Index, savePointRange: Range?)? { assert(payload.quantKind == .eager && payload.minTrips == 1 && payload.maxExtraTrips == nil) // Match at least once - guard let next = _doQuantifyMatch(payload) else { - signalFailure() - return false + // + // NOTE: Due to newline-sequence in scalar-semantic mode advancing two + // positions, we can't just have doRunEagerZeroOrMoreQuantify return the + // range-end and advance the range-start ourselves. Instead, we do one + // call before looping. + guard let next = doQuantifyMatch( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + return nil } // Run `a+` as `aa*` - currentPosition = next - _doRunEagerZeroOrMoreQuantify(payload) - return true + return doRunEagerZeroOrMoreQuantify( + payload, + asciiBitset: asciiBitset, + at: next, + limitedBy: end) } /// Specialized quantify instruction interpreter for ? - mutating func runZeroOrOneQuantify(_ payload: QuantifyPayload) -> Bool { + fileprivate func runZeroOrOneQuantify( + _ payload: QuantifyPayload, + asciiBitset: ASCIIBitset?, // Necessary ugliness... + at currentPosition: Index, + limitedBy end: Index + ) -> (Index, makeSavePoint: Bool) { assert(payload.minTrips == 0 && payload.maxExtraTrips == 1) - let next = _doQuantifyMatch(payload) - guard let idx = next else { - return true // matched zero times + guard let next = doQuantifyMatch( + payload, + asciiBitset: asciiBitset, + at: currentPosition, + limitedBy: end + ) else { + return (currentPosition, false) } - if payload.quantKind != .possessive { - // Save the zero match - savePoints.append(makeSavePoint(resumingAt: currentPC+1)) - } - currentPosition = idx - return true - } + return (next, payload.quantKind != .possessive) + } } + + diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 86365322b..310b5d932 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -515,23 +515,7 @@ extension Processor { controller.step() } case .quantify: - let quantPayload = payload.quantify - let matched: Bool - switch (quantPayload.quantKind, quantPayload.minTrips, quantPayload.maxExtraTrips) { - case (.reluctant, _, _): - assertionFailure(".reluctant is not supported by .quantify") - return - case (.eager, 0, nil): - runEagerZeroOrMoreQuantify(quantPayload) - matched = true - case (.eager, 1, nil): - matched = runEagerOneOrMoreQuantify(quantPayload) - case (_, 0, 1): - matched = runZeroOrOneQuantify(quantPayload) - default: - matched = runQuantify(quantPayload) - } - if matched { + if runQuantify(payload.quantify) { controller.step() } diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift index 8555ec85c..d63370b55 100644 --- a/Sources/_StringProcessing/Utility/Misc.swift +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -65,3 +65,15 @@ enum QuickResult { case unknown } +extension String { + /// Index after in either grapheme or scalar view + func index(after idx: Index, isScalarSemantics: Bool) -> Index { + if isScalarSemantics { + return unicodeScalars.index(after: idx) + } else { + return index(after: idx) + } + } +} + +