Skip to content

Commit

Permalink
[xtk-ui, with let rfc3349 = false;] rfc3349
Browse files Browse the repository at this point in the history
  • Loading branch information
nnethercote committed Jan 23, 2024
1 parent 071d9ce commit 73a7193
Show file tree
Hide file tree
Showing 15 changed files with 195 additions and 89 deletions.
18 changes: 12 additions & 6 deletions compiler/rustc_ast/src/util/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
use crate::token::{self, Token};
use rustc_lexer::unescape::{
byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit,
Mode,
unescape_byte, unescape_char, unescape_mixed, unescape_non_mixed, MixedUnit, Mode,
};
use rustc_span::symbol::{kw, sym, Symbol};
use rustc_span::Span;
Expand Down Expand Up @@ -85,7 +84,7 @@ impl LitKind {
// Force-inlining here is aggressive but the closure is
// called on every char in the string, so it can be hot in
// programs with many long strings containing escapes.
unescape_literal(
unescape_non_mixed(
s,
Mode::Str,
&mut #[inline(always)]
Expand All @@ -109,8 +108,15 @@ impl LitKind {
token::ByteStr => {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
unescape_literal(s, Mode::ByteStr, &mut |_, c| match c {
Ok(c) => buf.push(byte_from_char(c)),
// We can just use `rfc3349 = true` here, which is more
// permissive than `rfc3349 = false`, because escapes and
// chars were checked by the lexer.
let rfc3349 = true;
unescape_mixed(s, Mode::ByteStr { rfc3349 }, &mut |_, c| match c {
Ok(MixedUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Ok(MixedUnit::HighByte(b)) => buf.push(b),
Err(err) => {
assert!(!err.is_fatal(), "failed to unescape string literal")
}
Expand All @@ -126,7 +132,7 @@ impl LitKind {
token::CStr => {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
Ok(MixedUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Expand Down
2 changes: 2 additions & 0 deletions compiler/rustc_ast_passes/src/feature_gate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,8 @@ pub fn check_crate(krate: &ast::Crate, sess: &Session, features: &Features) {
}
};
}
// njn: right wording?
gate_all!(mixed_utf8_literals, r#"mixed utf8 b"..." and br"..." literals are experimental"#);
gate_all!(
if_let_guard,
"`if let` guards are experimental",
Expand Down
2 changes: 2 additions & 0 deletions compiler/rustc_feature/src/unstable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,8 @@ declare_features! (
/// standard library until the soundness issues with specialization
/// are fixed.
(unstable, min_specialization, "1.7.0", Some(31844)),
/// Allows mixed utf8 b"..." and br"..." literals.
(unstable, mixed_utf8_literals, "CURRENT_RUSTC_VERSION", Some(116907)),
/// Allows qualified paths in struct expressions, struct patterns and tuple struct patterns.
(unstable, more_qualified_paths, "1.54.0", Some(86935)),
/// Allows the `#[must_not_suspend]` attribute.
Expand Down
90 changes: 60 additions & 30 deletions compiler/rustc_lexer/src/unescape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ use Mode::*;
#[cfg(test)]
mod tests;

// njn: need to add tests in tests/ui/mixed-utf8-literals/; see
// tests/ui/try-block/ for an example to follow

/// Errors and warnings that can occur during string unescaping. They mostly
/// relate to malformed escape sequences, but there are a few that are about
/// other problems.
Expand Down Expand Up @@ -80,12 +83,12 @@ impl EscapeError {
}
}

/// Takes a contents of a literal (without quotes) and produces a sequence of
/// escaped characters or errors.
/// Takes the contents of a non-mixed-utf8 literal (without quotes) and produces
/// a sequence of escaped characters or errors.
///
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
/// the callback will be called exactly once.
pub fn unescape_literal<F>(src: &str, mode: Mode, callback: &mut F)
pub fn unescape_non_mixed<F>(src: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
Expand All @@ -95,9 +98,18 @@ where
let res = unescape_char_or_byte(&mut chars, mode);
callback(0..(src.len() - chars.as_str().len()), res);
}
Str | ByteStr => unescape_non_raw_common(src, mode, callback),
RawStr | RawByteStr => check_raw_common(src, mode, callback),
CStr | RawCStr => unreachable!(),
Str => unescape_non_raw_common(src, mode, callback),
RawStr => check_raw_common(src, mode, callback),
RawByteStr { .. } => check_raw_common(src, mode, &mut |r, result| callback(r, result)),
RawCStr => {
check_raw_common(src, mode, &mut |r, mut result| {
if let Ok('\0') = result {
result = Err(EscapeError::NulInCStr);
}
callback(r, result)
});
}
ByteStr { .. } | CStr => unreachable!(),
}
}

Expand Down Expand Up @@ -132,11 +144,16 @@ impl From<u8> for MixedUnit {
}
}

pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
/// Takes the contents of a mixed-utf8 literal (without quotes) and produces
/// a sequence of escaped characters or errors.
///
/// Values are returned by invoking `callback`.
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
{
match mode {
ByteStr { .. } => unescape_non_raw_common(src, mode, &mut |r, result| callback(r, result)),
CStr => {
unescape_non_raw_common(src, mode, &mut |r, mut result| {
if let Ok(MixedUnit::Char('\0')) = result {
Expand All @@ -145,16 +162,7 @@ where
callback(r, result)
});
}
RawCStr => {
check_raw_common(src, mode, &mut |r, mut result| {
if let Ok('\0') = result {
result = Err(EscapeError::NulInCStr);
}
// High bytes aren't possible in raw strings.
callback(r, result.map(MixedUnit::Char))
});
}
Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
Char | Byte | Str | RawStr | RawByteStr { .. } | RawCStr => unreachable!(),
}
}

Expand All @@ -180,8 +188,8 @@ pub enum Mode {
Str,
RawStr,

ByteStr,
RawByteStr,
ByteStr { rfc3349: bool },
RawByteStr { rfc3349: bool },

CStr,
RawCStr,
Expand All @@ -190,7 +198,7 @@ pub enum Mode {
impl Mode {
pub fn in_double_quotes(self) -> bool {
match self {
Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true,
Str | RawStr | ByteStr { .. } | RawByteStr { .. } | CStr | RawCStr => true,
Char | Byte => false,
}
}
Expand All @@ -199,33 +207,39 @@ impl Mode {
fn allow_high_bytes(self) -> bool {
match self {
Char | Str => false,
Byte | ByteStr | CStr => true,
RawStr | RawByteStr | RawCStr => unreachable!(),
Byte | ByteStr { .. } | CStr => true,
RawStr | RawByteStr { .. } | RawCStr => unreachable!(),
}
}

/// Are unicode (non-ASCII) chars allowed?
#[inline]
fn allow_unicode_chars(self) -> bool {
match self {
Byte | ByteStr | RawByteStr => false,
Char | Str | RawStr | CStr | RawCStr => true,
Byte | ByteStr { rfc3349: false } | RawByteStr { rfc3349: false } => false,
Char
| Str
| RawStr
| ByteStr { rfc3349: true }
| RawByteStr { rfc3349: true }
| CStr
| RawCStr => true,
}
}

/// Are unicode escapes (`\u`) allowed?
fn allow_unicode_escapes(self) -> bool {
match self {
Byte | ByteStr => false,
Char | Str | CStr => true,
RawByteStr | RawStr | RawCStr => unreachable!(),
Byte | ByteStr { rfc3349: false } => false,
Char | Str | ByteStr { rfc3349: true } | CStr => true,
RawByteStr { .. } | RawStr | RawCStr => unreachable!(),
}
}

pub fn prefix_noraw(self) -> &'static str {
match self {
Char | Str | RawStr => "",
Byte | ByteStr | RawByteStr => "b",
Byte | ByteStr { .. } | RawByteStr { .. } => "b",
CStr | RawCStr => "c",
}
}
Expand Down Expand Up @@ -263,12 +277,14 @@ fn scan_escape<T: From<char> + From<u8>>(
Ok(T::from(value as u8))
};
}
// njn: gate: is it a ByteStr?
'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
_ => return Err(EscapeError::InvalidEscape),
};
Ok(T::from(res))
}

// njn: change arg to mode in precursor?
fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
// We've parsed '\u', now we have to parse '{..}'.

Expand Down Expand Up @@ -333,6 +349,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
'\\' => scan_escape(chars, mode),
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
// njn: this is the only ascii_check that will remain
_ => ascii_check(c, mode.allow_unicode_chars()),
}?;
if chars.next().is_some() {
Expand Down Expand Up @@ -373,6 +390,10 @@ where
}
'"' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),

// njn: gate, similar to check_raw_common, check:
// - is it a ByteStr AND does it contain a unicode char

_ => ascii_check(c, allow_unicode_chars).map(T::from),
};
let end = src.len() - chars.as_str().len();
Expand Down Expand Up @@ -424,6 +445,15 @@ where
let start = src.len() - chars.as_str().len() - c.len_utf8();
let res = match c {
'\r' => Err(EscapeError::BareCarriageReturnInRawString),

// njn: gate: need to somehow return an indication of whether
// rfc3349 unicode char allowance was required for this literal,
// i.e. check
// - is it a RawByteStr AND does it contain a unicode char
//
// njn: but the ascii_check itself isn't necessary
// - or make it return three values? ok, ok-with-3349, bad?

_ => ascii_check(c, allow_unicode_chars),
};
let end = src.len() - chars.as_str().len();
Expand All @@ -432,8 +462,8 @@ where
}

#[inline]
pub fn byte_from_char(c: char) -> u8 {
pub(crate) fn byte_from_char(c: char) -> u8 {
let res = c as u32;
debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
debug_assert!(res <= u8::MAX as u32, "guaranteed because of Byte");
res as u8
}
25 changes: 15 additions & 10 deletions compiler/rustc_lexer/src/unescape/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ fn test_unescape_char_good() {
fn test_unescape_str_warn() {
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
let mut unescaped = Vec::with_capacity(literal.len());
unescape_literal(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
unescape_non_mixed(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
assert_eq!(unescaped, expected);
}

Expand All @@ -124,7 +124,7 @@ fn test_unescape_str_warn() {
fn test_unescape_str_good() {
fn check(literal_text: &str, expected: &str) {
let mut buf = Ok(String::with_capacity(literal_text.len()));
unescape_literal(literal_text, Mode::Str, &mut |range, c| {
unescape_non_mixed(literal_text, Mode::Str, &mut |range, c| {
if let Ok(b) = &mut buf {
match c {
Ok(c) => b.push(c),
Expand Down Expand Up @@ -240,16 +240,19 @@ fn test_unescape_byte_good() {
#[test]
fn test_unescape_byte_str_good() {
fn check(literal_text: &str, expected: &[u8]) {
let mut buf = Ok(Vec::with_capacity(literal_text.len()));
unescape_literal(literal_text, Mode::ByteStr, &mut |range, c| {
if let Ok(b) = &mut buf {
let mut buf_res = Ok(Vec::with_capacity(literal_text.len()));
unescape_mixed(literal_text, Mode::ByteStr { rfc3349: false }, &mut |range, c| {
if let Ok(buf) = &mut buf_res {
match c {
Ok(c) => b.push(byte_from_char(c)),
Err(e) => buf = Err((range, e)),
Ok(MixedUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Ok(MixedUnit::HighByte(b)) => buf.push(b),
Err(e) => buf_res = Err((range, e)),
}
}
});
assert_eq!(buf.as_deref(), Ok(expected))
assert_eq!(buf_res.as_deref(), Ok(expected))
}

check("foo", b"foo");
Expand All @@ -264,7 +267,7 @@ fn test_unescape_byte_str_good() {
fn test_unescape_raw_str() {
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
let mut unescaped = Vec::with_capacity(literal.len());
unescape_literal(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
unescape_non_mixed(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
assert_eq!(unescaped, expected);
}

Expand All @@ -276,7 +279,9 @@ fn test_unescape_raw_str() {
fn test_unescape_raw_byte_str() {
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
let mut unescaped = Vec::with_capacity(literal.len());
unescape_literal(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res)));
unescape_non_mixed(literal, Mode::RawByteStr { rfc3349: false }, &mut |range, res| {
unescaped.push((range, res))
});
assert_eq!(unescaped, expected);
}

Expand Down
4 changes: 4 additions & 0 deletions compiler/rustc_parse/messages.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -814,6 +814,10 @@ parse_unexpected_vert_vert_before_function_parameter = unexpected `||` before fu
parse_unexpected_vert_vert_in_pattern = unexpected token `||` in pattern
.suggestion = use a single `|` to separate multiple alternative patterns
# njn:
# - b'\u{1234}' error says "unicode escape in byte string", should be "byte literal"
# - after rfc3349 stabilizes, byte literal will be the only error case here
# - could add a `.desc` field in a precursor
parse_unicode_escape_in_byte = unicode escape in byte string
.label = {parse_unicode_escape_in_byte}
.help = unicode escape sequences cannot be used as a byte or in a byte string
Expand Down
Loading

0 comments on commit 73a7193

Please sign in to comment.