From dce751f507b0ca581cd5b860b37a73611227f92a Mon Sep 17 00:00:00 2001 From: John Davis Date: Wed, 30 Oct 2024 13:23:42 -0700 Subject: [PATCH 1/2] [rust] Skip unnecessary calls to find_among This change ports an optimization from the C stemmer generator to the rust generator. In order to determine whether find_among and find_among_b are worth calling, the generated C checks that the current string is long enough to potentially match the shortest string literal in the among, then does some bitwise magic to check that the characters in the current string at length of the end of the shortest string in the among are the same as the character at that position in at least one of the among strings. If the current string is too short or that character doesn't match any of the amongs at that position, then we can fail-fast instead of calling find_among. This change allows the generated rust stemmers to similarly skip find_among calls where we can quickly determine that find_among would not succeed. In some stemming benchmarks I've been running, this improves the performance of the rust english stemmer by between 40 and 50%. --- compiler/generator_rust.c | 111 +++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 2 deletions(-) diff --git a/compiler/generator_rust.c b/compiler/generator_rust.c index deb2ee06..b5ac2225 100644 --- a/compiler/generator_rust.c +++ b/compiler/generator_rust.c @@ -171,8 +171,9 @@ static void write_failure(struct generator * g) { g->unreachable = true; break; default: - g->I[0] = g->failure_label; - w(g, "~Mbreak 'lab~I0;~N"); + w(g, "~Mbreak 'lab"); + write_int(g, g->failure_label); + w(g, ";~N"); g->unreachable = true; } } @@ -1046,11 +1047,116 @@ static void generate_functionend(struct generator * g, struct node * p) { static void generate_substring(struct generator * g, struct node * p) { struct among * x = p->among; + int block = -1; + unsigned int bitmap = 0; + struct amongvec * among_cases = x->b; + int c; + int empty_case = -1; + int n_cases = 0; + symbol cases[2]; + int shortest_size = x->shortest_size; + int block_opened = 0; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; + g->I[1] = x->literalstring_count; + + /* In forward mode with non-ASCII UTF-8 characters, the first byte + * of the string will often be the same, so instead look at the last + * common byte position. + * + * In backward mode, we can't match if there are fewer characters before + * the current position than the minimum length. + */ + for (c = 0; c < x->literalstring_count; ++c) { + symbol ch; + if (among_cases[c].size == 0) { + empty_case = c; + continue; + } + if (p->mode == m_forward) { + ch = among_cases[c].b[shortest_size - 1]; + } else { + ch = among_cases[c].b[among_cases[c].size - 1]; + } + if (n_cases == 0) { + block = ch >> 5; + } else if (ch >> 5 != block) { + block = -1; + if (n_cases > 2) break; + } + if (block == -1) { + if (n_cases > 0 && ch == cases[0]) continue; + if (n_cases < 2) { + cases[n_cases++] = ch; + } else if (ch != cases[1]) { + ++n_cases; + break; + } + } else { + if ((bitmap & (1u << (ch & 0x1f))) == 0) { + bitmap |= 1u << (ch & 0x1f); + if (n_cases < 2) cases[n_cases] = ch; + ++n_cases; + } + } + } + + if (block != -1 || n_cases <= 2) { + char buf[64]; + g->I[2] = block; + g->I[3] = bitmap; + g->I[4] = shortest_size - 1; + if (p->mode == m_forward) { + sprintf(buf, "env.current.as_bytes()[(env.cursor + %d) as usize]", shortest_size - 1); + g->S[1] = buf; + if (shortest_size == 1) { + writef(g, "~Mif (env.cursor >= env.limit", p); + } else { + writef(g, "~Mif (env.cursor + ~I4 >= env.limit", p); + } + } else { + g->S[1] = "env.current.as_bytes()[(env.cursor - 1) as usize]"; + if (shortest_size == 1) { + writef(g, "~Mif (env.cursor <= env.limit_backward", p); + } else { + writef(g, "~Mif (env.cursor - ~I4 <= env.limit_backward", p); + } + } + if (n_cases == 0) { + /* We get this for the degenerate case: among ( '' ) + * This doesn't seem to be a useful construct, but it is + * syntactically valid. + */ + } else if (n_cases == 1) { + g->I[4] = cases[0]; + writef(g, " || ~S1 as i32 != ~I4 as i32", p); + } else if (n_cases == 2) { + g->I[4] = cases[0]; + g->I[5] = cases[1]; + writef(g, " || (~S1 as i32 != ~I4 as i32 && ~S1 as i32 != ~I5 as i32)", p); + } else { + writef(g, " || ~S1 as i32 >> 5 != ~I2 as i32 || !((~I3 as i32 >> (~S1 as i32 & 0x1f)) & 1) != 0", p); + } + write_string(g, ") "); + if (empty_case != -1) { + /* If the among includes the empty string, it can never fail + * so not matching the bitmap means we match the empty string. + */ + g->I[4] = among_cases[empty_case].result; + writef(g, "{among_var = ~I4;}~N~Melse ", p); + write_block_start(g); + block_opened = 1; + } else { + writef(g, "~f~C", p); + } + } else { +#ifdef OPTIMISATION_WARNINGS + printf("Couldn't shortcut among %d\n", x->number); +#endif + } if (x->amongvar_needed) { writef(g, "~Mamong_var = env.find_among~S0(~A_~I0, context);~N", p); @@ -1062,6 +1168,7 @@ static void generate_substring(struct generator * g, struct node * p) { } else { write_failure_if(g, "env.find_among~S0(~A_~I0, context) == 0", p); } + if (block_opened) write_block_end(g); } static void generate_among(struct generator * g, struct node * p) { From 9506b39b618e2c0199561243c5d8226e0a9c3525 Mon Sep 17 00:00:00 2001 From: John Davis Date: Fri, 1 Nov 2024 14:56:39 -0700 Subject: [PATCH 2/2] [rust] Fix types in find_among skip checks --- compiler/generator_rust.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler/generator_rust.c b/compiler/generator_rust.c index b5ac2225..bdd63285 100644 --- a/compiler/generator_rust.c +++ b/compiler/generator_rust.c @@ -1132,13 +1132,13 @@ static void generate_substring(struct generator * g, struct node * p) { */ } else if (n_cases == 1) { g->I[4] = cases[0]; - writef(g, " || ~S1 as i32 != ~I4 as i32", p); + writef(g, " || ~S1 as u8 != ~I4 as u8", p); } else if (n_cases == 2) { g->I[4] = cases[0]; g->I[5] = cases[1]; - writef(g, " || (~S1 as i32 != ~I4 as i32 && ~S1 as i32 != ~I5 as i32)", p); + writef(g, " || (~S1 as u8 != ~I4 as u8 && ~S1 as u8 != ~I5 as u8)", p); } else { - writef(g, " || ~S1 as i32 >> 5 != ~I2 as i32 || !((~I3 as i32 >> (~S1 as i32 & 0x1f)) & 1) != 0", p); + writef(g, " || ~S1 as u8 >> 5 != ~I2 as u8 || ((~I3 as i32 >> (~S1 as u8 & 0x1f)) & 1) == 0", p); } write_string(g, ") "); if (empty_case != -1) {