diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index e0a6c7465..4d422d24e 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -310,10 +310,13 @@ fileprivate extension Compiler.ByteCodeGen { let intercept = builder.makeAddress() let success = builder.makeAddress() - builder.buildSave(success) - builder.buildSave(intercept) + // Positive lookaheads propagate captures through the success SP + builder.buildSave(success, keepsCaptures: true) + // Negative lookaheads should not propagate captures, so the intercept SP + // does not keep captures + let id = builder.buildSaveWithID(intercept, keepsCaptures: false) try emitNode(child) - builder.buildClearThrough(intercept) + builder.buildClearThrough(id) if !positive { builder.buildClear() } @@ -347,10 +350,10 @@ fileprivate extension Compiler.ByteCodeGen { let intercept = builder.makeAddress() let success = builder.makeAddress() - builder.buildSaveAddress(success) - builder.buildSave(intercept) + builder.buildSaveAddress(success, keepsCaptures: true) + let id = builder.buildSaveWithID(intercept) try emitNode(child) - builder.buildClearThrough(intercept) + builder.buildClearThrough(id) builder.buildFail() builder.label(intercept) @@ -569,8 +572,9 @@ fileprivate extension Compiler.ByteCodeGen { } // Set up a dummy save point for possessive to update + var dummyID: SavePointID? = nil if updatedKind == .possessive { - builder.pushEmptySavePoint() + dummyID = builder.pushEmptySavePoint() } // min-trip-count: @@ -620,13 +624,16 @@ fileprivate extension Compiler.ByteCodeGen { } // exit-policy: + // // condBranch(to: exit, ifZeroElseDecrement: %extraTrips) // // // SavePoint { SavePoint( + id: id, pc: pc, pos: addressOnly ? nil : currentPosition, rangeStart: nil, rangeEnd: nil, stackEnd: .init(callStack.count), - captureEnds: storedCaptures, + captureEnds: keepCaptures ? nil : storedCaptures, intRegisters: registers.ints, posRegisters: registers.positions) } @@ -91,6 +95,7 @@ extension Processor { func startQuantifierSavePoint() -> SavePoint { // Restores to the instruction AFTER the current quantifier instruction SavePoint( + id: nil, pc: controller.pc + 1, pos: nil, rangeStart: nil, diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index f6d5bfcc7..341db227c 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -261,6 +261,12 @@ extension Instruction.Payload { var capture: CaptureRegister { interpret() } + init(saveID: SavePointID) { + self.init(saveID) + } + var saveID: SavePointID { + interpret() + } // MARK: Packed operand payloads @@ -348,6 +354,17 @@ extension Instruction.Payload { } // MARK: Struct payloads + init( + save: InstructionAddress, + id: SavePointID?, + splitTo: InstructionAddress? = nil, + keepsCaptures: Bool + ) { + self.init(SavePayload(address: save, id: id, splitTo: splitTo, keepsCaptures: keepsCaptures).rawValue) + } + var savePayload: SavePayload { + SavePayload.init(rawValue: rawValue & _payloadMask) + } init(_ model: _CharacterClassModel) { self.init(CharacterClassPayload(model).rawValue) @@ -613,3 +630,57 @@ struct AssertionPayload: RawRepresentable { } } } + +struct SavePayload: RawRepresentable { + let rawValue: UInt64 + + init(rawValue: UInt64) { + self.rawValue = rawValue + assert(rawValue & _opcodeMask == 0) + } + + init( + address: InstructionAddress, + id: SavePointID?, + splitTo: InstructionAddress?, + keepsCaptures: Bool + ) { + let idVal: UInt64 + if let id = id { + idVal = UInt64(id.rawValue) << 9 + } else { + idVal = 1 << 8 + } + let splitVal: UInt64 + if let splitAddr = splitTo { + splitVal = UInt64(splitAddr.rawValue) << 19 + } else { + splitVal = 1 << 18 + } + let keepsCapturesVal: UInt64 = keepsCaptures ? 1 << 30 : 0 + assert(UInt64(address.rawValue) & idVal & splitVal & keepsCapturesVal == 0) + self.rawValue = UInt64(address.rawValue) + idVal + splitVal + keepsCapturesVal + } + + var addr: InstructionAddress { + TypedInt(self.rawValue & 0xFF) + } + + var hasID: Bool { + self.rawValue & (1 << 8) == 0 + } + var id: SavePointID { + TypedInt((self.rawValue >> 9) & 0xFF) + } + + var hasSplit: Bool { + self.rawValue & (1 << 18) == 0 + } + var split: InstructionAddress { + TypedInt((self.rawValue >> 19) & 0xFF) + } + + var keepsCaptures: Bool { + self.rawValue & (1 << 30) != 0 + } +} diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 21ab90a03..86c88c0de 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -183,6 +183,9 @@ extension Instruction { /// /// Precondition: There is a save point to remove case clear + + /// Remove the save point with the given id + case clearPossessiveQuantDummy /// Remove save points up to and including the operand /// diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 4b623fbda..1fad7ffbb 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -51,6 +51,8 @@ extension MEProgram { // We currently deduce the capture count from the capture register number. nextCaptureRegister.rawValue } + + var nextSavePointID: SavePointID = 0 init() {} } @@ -60,11 +62,28 @@ extension MEProgram.Builder { struct AddressFixup { var first: AddressToken var second: AddressToken? = nil - - init(_ a: AddressToken) { self.first = a } - init(_ a: AddressToken, _ b: AddressToken) { + var id: SavePointID? + var keepsCaptures: Bool + + init( + _ a: AddressToken, + id: SavePointID?, + keepsCaptures: Bool + ) { + self.first = a + self.id = id + self.keepsCaptures = keepsCaptures + } + init( + _ a: AddressToken, + _ b: AddressToken, + id: SavePointID?, + keepsCaptures: Bool + ) { self.first = a self.second = b + self.id = id + self.keepsCaptures = keepsCaptures } } } @@ -117,27 +136,42 @@ extension MEProgram.Builder { fixup(to: t) } - mutating func buildSave(_ t: AddressToken) { + mutating func buildSave(_ t: AddressToken, keepsCaptures: Bool = false) { instructions.append(.init(.save)) - fixup(to: t) + fixup(to: t, keepsCaptures: keepsCaptures) } - mutating func buildSaveAddress(_ t: AddressToken) { + mutating func buildSaveAddress(_ t: AddressToken, keepsCaptures: Bool = false) { instructions.append(.init(.saveAddress)) - fixup(to: t) + fixup(to: t, keepsCaptures: keepsCaptures) + } + + mutating func buildSaveWithID(_ t: AddressToken, keepsCaptures: Bool = false) -> SavePointID { + let id = makeSavePointID() + instructions.append(.init(.save)) + fixup(to: t, id: id, keepsCaptures: keepsCaptures) + return id + } + mutating func buildSaveAddressWithID(_ t: AddressToken) -> SavePointID { + let id = makeSavePointID() + instructions.append(.init(.saveAddress)) + fixup(to: t, id: id) + return id } mutating func buildSplit( - to: AddressToken, saving: AddressToken + to: AddressToken, saving: AddressToken, id: SavePointID? = nil ) { instructions.append(.init(.splitSaving)) - fixup(to: (to, saving)) + fixup(to: (to, saving), id: id) } mutating func buildClear() { instructions.append(.init(.clear)) } - mutating func buildClearThrough(_ t: AddressToken) { - instructions.append(.init(.clearThrough)) - fixup(to: t) + mutating func buildClear(possessiveQuantDummy id: SavePointID) { + instructions.append(.init(.clearPossessiveQuantDummy, .init(saveID: id))) + } + mutating func buildClearThrough(_ id: SavePointID) { + instructions.append(.init(.clearThrough, .init(saveID: id))) } mutating func buildFail() { instructions.append(.init(.fail)) @@ -357,15 +391,16 @@ extension MEProgram.Builder { payload = .init(addr: addr, int: inst.payload.int) case .condBranchSamePosition: payload = .init(addr: addr, position: inst.payload.position) - case .branch, .save, .saveAddress, .clearThrough: + case .branch: payload = .init(addr: addr) - + case .save, .saveAddress: + payload = .init(save: addr, id: tok.id, keepsCaptures: tok.keepsCaptures) case .splitSaving: guard let fix2 = tok.second else { throw Unreachable("TODO: reason") } let saving = addressTokens[fix2.rawValue]! - payload = .init(addr: addr, addr2: saving) + payload = .init(save: saving, id: tok.id, splitTo: addr, keepsCaptures: tok.keepsCaptures) default: throw Unreachable("TODO: reason") @@ -435,22 +470,28 @@ extension MEProgram.Builder { // Associate the most recently added instruction with // the provided token, ensuring it is fixed up during // assembly - mutating func fixup(to t: AddressToken) { + mutating func fixup( + to t: AddressToken, + id: SavePointID? = nil, + keepsCaptures: Bool = false + ) { assert(!instructions.isEmpty) addressFixups.append( - (InstructionAddress(instructions.endIndex-1), .init(t))) + (InstructionAddress(instructions.endIndex-1), .init(t, id: id, keepsCaptures: keepsCaptures))) } // Associate the most recently added instruction with // the provided tokens, ensuring it is fixed up during // assembly mutating func fixup( - to ts: (AddressToken, AddressToken) + to ts: (AddressToken, AddressToken), + id: SavePointID? = nil, + keepsCaptures: Bool = false ) { assert(!instructions.isEmpty) addressFixups.append(( InstructionAddress(instructions.endIndex-1), - .init(ts.0, ts.1))) + .init(ts.0, ts.1, id: id, keepsCaptures: keepsCaptures))) } // Push an "empty" save point which will, upon restore, just restore from @@ -459,11 +500,11 @@ extension MEProgram.Builder { // // This is useful for possessive quantification that needs some initial save // point to "ratchet" upon a successful match. - mutating func pushEmptySavePoint() { + mutating func pushEmptySavePoint() -> SavePointID { if failAddressToken == nil { failAddressToken = makeAddress() } - buildSaveAddress(failAddressToken!) + return buildSaveAddressWithID(failAddressToken!) } } @@ -528,6 +569,10 @@ extension MEProgram.Builder { defer { nextPositionRegister.rawValue += 1 } return r } + mutating func makeSavePointID() -> SavePointID { + defer { nextSavePointID.rawValue += 1 } + return nextSavePointID + } // TODO: A register-mapping helper struct, which could release // registers without monotonicity required diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 18e355fb5..86fd2e96c 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -117,6 +117,7 @@ extension Processor { self.searchBounds = searchBounds self.matchMode = matchMode self.isTracingEnabled = isTracingEnabled + // self.isTracingEnabled = true self.shouldMeasureMetrics = shouldMeasureMetrics self.currentPosition = searchBounds.lowerBound @@ -375,7 +376,7 @@ extension Processor { pc: InstructionAddress, pos: Position?, stackEnd: CallStackAddress, - captureEnds: [_StoredCapture], + captureEnds: [_StoredCapture]?, intRegisters: [Int], PositionRegister: [Input.Index] ) @@ -393,14 +394,16 @@ extension Processor { } assert(stackEnd.rawValue <= callStack.count) - assert(capEnds.count == storedCaptures.count) controller.pc = pc currentPosition = pos ?? currentPosition callStack.removeLast(callStack.count - stackEnd.rawValue) - storedCaptures = capEnds registers.ints = intRegisters registers.positions = posRegisters + if let capEnds = capEnds { + assert(capEnds.count == storedCaptures.count) + storedCaptures = capEnds + } if shouldMeasureMetrics { metrics.backtracks += 1 } } @@ -426,9 +429,9 @@ extension Processor { } } - mutating func clearThrough(_ address: InstructionAddress) { + mutating func clearThrough(_ idToClear: SavePointID) { while let sp = savePoints.popLast() { - if sp.pc == address { + if let id = sp.id, id == idToClear { controller.step() return } @@ -441,7 +444,7 @@ extension Processor { _checkInvariants() assert(state == .inProgress) -#if PROCESSOR_MEASUREMENTS_ENABLED +// #if PROCESSOR_MEASUREMENTS_ENABLED if cycleCount == 0 { trace() measureMetrics() @@ -452,7 +455,7 @@ extension Processor { measureMetrics() _checkInvariants() } -#endif +// #endif let (opcode, payload) = fetch().destructure switch opcode { @@ -489,22 +492,40 @@ extension Processor { controller.step() } case .save: + // lily fixme: merge the three cases of save together + let payload = payload.savePayload let resumeAddr = payload.addr - let sp = makeSavePoint(resumeAddr) + let sp: SavePoint + if payload.hasID { + sp = makeSavePoint(resumeAddr, withID: payload.id, keepCaptures: payload.keepsCaptures) + } else { + sp = makeSavePoint(resumeAddr, keepCaptures: payload.keepsCaptures) + } savePoints.append(sp) controller.step() case .saveAddress: + let payload = payload.savePayload let resumeAddr = payload.addr - let sp = makeSavePoint(resumeAddr, addressOnly: true) + let sp: SavePoint + if payload.hasID { + sp = makeSavePoint(resumeAddr, addressOnly: true, withID: payload.id, keepCaptures: payload.keepsCaptures) + } else { + sp = makeSavePoint(resumeAddr, addressOnly: true, keepCaptures: payload.keepsCaptures) + } savePoints.append(sp) controller.step() case .splitSaving: - let (nextPC, resumeAddr) = payload.pairedAddrAddr - let sp = makeSavePoint(resumeAddr) + let payload = payload.savePayload + let sp: SavePoint + if payload.hasID { + sp = makeSavePoint(payload.addr, withID: payload.id, keepCaptures: payload.keepsCaptures) + } else { + sp = makeSavePoint(payload.addr, keepCaptures: payload.keepsCaptures) + } savePoints.append(sp) - controller.pc = nextPC + controller.pc = payload.split case .clear: if let _ = savePoints.popLast() { @@ -515,8 +536,23 @@ extension Processor { } case .clearThrough: - clearThrough(payload.addr) - + clearThrough(payload.saveID) + + case .clearPossessiveQuantDummy: + let idToClear = payload.saveID + if let latest = savePoints.last, let id = latest.id, id == idToClear { + // If we have save points within the posessive quantifier, we can't clear + // the dummy save point just yet + + // The last time we exit the possessive quantifier, the last one will + // be the dummy with the correct id, and only then do we clear it + + // Since we only leave the dummy SP if there are save points below it, + // and those save points below it must restore into the reluctant quantifier (lily note: check this) + // we will eventually clear the dummy in this instruction + savePoints.removeLast() + } + controller.step() case .accept: tryAccept() diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 725319b00..9ed2b51ea 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -89,15 +89,21 @@ extension Instruction: CustomStringConvertible { case .quantify: let payload = payload.quantify return "\(opcode) \(payload.type) \(payload.minTrips) \(payload.extraTrips?.description ?? "unbounded" )" - case .save: + case .save, .saveAddress: + let payload = payload.savePayload let resumeAddr = payload.addr - return "\(opcode) \(resumeAddr)" - case .saveAddress: - let resumeAddr = payload.addr - return "\(opcode) \(resumeAddr)" + if payload.hasID { + return "\(opcode) \(resumeAddr) keepsCaptures? \(payload.keepsCaptures) id: \(payload.id)" + } + return "\(opcode) \(resumeAddr) keepsCaptures? \(payload.keepsCaptures)" case .splitSaving: - let (nextPC, resumeAddr) = payload.pairedAddrAddr - return "\(opcode) saving: \(resumeAddr) jumpingTo: \(nextPC)" + let payload = payload.savePayload + if payload.hasID { + return "\(opcode) saving: \(payload.addr) (id: \(payload.id)) jumpingTo: \(payload.split)" + } + return "\(opcode) saving: \(payload.addr) jumpingTo: \(payload.split)" + case .clearPossessiveQuantDummy: + return "\(opcode) clearingIfLatestSavePointID == \(payload.saveID)" case .transformCapture: let (cap, trans) = payload.pairedCaptureTransform return "\(opcode) trans[\(trans)](\(cap))" @@ -121,8 +127,14 @@ extension Processor.SavePoint { posStr = "\(startStr)...\(endStr)" } } + let idStr: String + if let id = self.id { + idStr = ", id: \(id)" + } else { + idStr = "" + } return """ - pc: \(self.pc), pos: \(posStr), stackEnd: \(stackEnd) + pc: \(self.pc), pos: \(posStr), stackEnd: \(stackEnd), keepsCaptures? \(self.captureEnds == nil)\(idStr) """ } } diff --git a/Sources/_StringProcessing/Utility/TypedInt.swift b/Sources/_StringProcessing/Utility/TypedInt.swift index e03f2572f..82998e8bd 100644 --- a/Sources/_StringProcessing/Utility/TypedInt.swift +++ b/Sources/_StringProcessing/Utility/TypedInt.swift @@ -110,6 +110,8 @@ enum _PositionStackAddress {} typealias SavePointStackAddress = TypedInt<_SavePointAddress> enum _SavePointAddress {} +typealias SavePointID = TypedInt<_SavePointID> +enum _SavePointID {} // MARK: - Registers diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 752921e19..657854c59 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -27,6 +27,7 @@ enum DecodedInstr { case splitSaving case clear case clearThrough + case clearReluctantQuantDummy case accept case fail case advance @@ -81,6 +82,8 @@ extension DecodedInstr { return .clear case .clearThrough: return .clearThrough + case .clearReluctantQuantDummy: + return .clearReluctantQuantDummy case .accept: return .accept case .fail: diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 2fa9b9381..c05e69ce7 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -391,7 +391,7 @@ extension RegexTests { #"a(?#. comment)b"#, input: "123abcxyz", match: "ab") } - func testMatchQuantification() { + func testMatchQuantification() throws { // MARK: Quantification firstMatchTest( @@ -460,6 +460,11 @@ extension RegexTests { firstMatchTests( ".*+x", ("abc", nil), ("abcx", nil), ("", nil)) + + // Make sure that possessive quant doesn't clear out existing save points + let r = try Regex("a?(ab)?+") + let match = try r.wholeMatch(in: "ab") + XCTAssertEqual(match?.0, "ab") firstMatchTests( "a+b", @@ -576,15 +581,18 @@ extension RegexTests { ("baaaaabc", nil), ("baaaaaaaabc", nil)) - // XFAIL'd possessive tests + // Possessive quantifier tests + // Ensure that save points are correctly handled in/through possessive quant firstMatchTests( "a?+a", - ("a", nil), - xfail: true) + ("a", nil)) + // Save points made inside the possessive quant should be kept and correctly + // restored to firstMatchTests( "(a|a)?+a", - ("a", nil), - xfail: true) + ("a", nil)) + // Even when restoring to an SP inside of a possessive quant, we must + // discard the save points from quantification correctly firstMatchTests( "(a|a){2,4}+a", ("a", nil), @@ -592,8 +600,7 @@ extension RegexTests { firstMatchTests( "(a|a){2,4}+a", ("aaa", nil), - ("aaaa", nil), - xfail: true) + ("aaaa", nil)) firstMatchTests( "(?:a{2,4}?b)+", @@ -1616,8 +1623,7 @@ extension RegexTests { // (?:a?+) and (?>a?+) are equivalent: they match one 'a' if available firstMatchTests( #"^(?:a?+)a$"#, - (input: "a", match: nil), - xfail: true) + (input: "a", match: nil)) firstMatchTests( #"^(?:a?+)a$"#, (input: "aa", match: "aa"), @@ -1640,8 +1646,7 @@ extension RegexTests { firstMatchTests( #"(?>(\d+))\w+\1"#, (input: "23x23", match: "23x23"), - (input: "123x23", match: "23x23"), - xfail: true) + (input: "123x23", match: "23x23")) // Backreferences in scalar mode // In scalar mode the backreference should not match @@ -1659,12 +1664,10 @@ extension RegexTests { (input: "abbba", match: nil), (input: "ABBA", match: nil), (input: "defABBAdef", match: nil)) - // FIXME: Backreferences don't escape positive lookaheads firstMatchTests( #"^(?=.*(.)(.)\2\1).+\2$"#, (input: "ABBAB", match: "ABBAB"), - (input: "defABBAdefB", match: "defABBAdefB"), - xfail: true) + (input: "defABBAdefB", match: "defABBAdefB")) firstMatchTests( #"^(?!.*(.)(.)\2\1).+$"#, @@ -2559,6 +2562,28 @@ extension RegexTests { } func testFuzzerArtifacts() throws { - expectCompletion(regex: #"(b?)\1*"#, in: "a") + // rdar://98517792 +// expectCompletion(regex: #"(b?)\1*"#, in: "a") +// +// // rdar://98852151 +// firstMatchTest(#"\p{L}?+\p{L}"#, input: "a", match: nil) +// firstMatchTest(#"🧟‍♀️?+🧟‍♀️"#, input: "🧟‍♀️", match: nil) +// firstMatchTest(#"🧟‍♀️{,3}+🧟‍♀️"#, input: "🧟‍♀️🧟‍♀️🧟‍♀️", match: nil) + + // rdar://98738984 + firstMatchTest( + #"(?>(a))\1"#, + input: #"aa"#, + match: "aa") + // lookbehind is not yet implemented + firstMatchTest( + #"a(?<=(a))\1"#, + input: #"aa"#, + match: "aa", + xfail: true) + firstMatchTest( + #"(?=(a))\1"#, + input: #"a"#, + match: "a") } }