[xtk-ui, with let rfc3349 = false;] rfc3349

rust-lang · Jan 23, 2024 · 73a7193 · 73a7193
1 parent 071d9ce
commit 73a7193
Show file tree

Hide file tree

Showing 15 changed files with 195 additions and 89 deletions.
diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs
@@ -3,8 +3,7 @@
 use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
 use crate::token::{self, Token};
 use rustc_lexer::unescape::{
-    byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit,
-    Mode,
+    unescape_byte, unescape_char, unescape_mixed, unescape_non_mixed, MixedUnit, Mode,
 };
 use rustc_span::symbol::{kw, sym, Symbol};
 use rustc_span::Span;
@@ -85,7 +84,7 @@ impl LitKind {
                     // Force-inlining here is aggressive but the closure is
                     // called on every char in the string, so it can be hot in
                     // programs with many long strings containing escapes.
-                    unescape_literal(
+                    unescape_non_mixed(
                         s,
                         Mode::Str,
                         &mut #[inline(always)]
@@ -109,8 +108,15 @@ impl LitKind {
             token::ByteStr => {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
-                unescape_literal(s, Mode::ByteStr, &mut |_, c| match c {
-                    Ok(c) => buf.push(byte_from_char(c)),
+                // We can just use `rfc3349 = true` here, which is more
+                // permissive than `rfc3349 = false`, because escapes and
+                // chars were checked by the lexer.
+                let rfc3349 = true;
+                unescape_mixed(s, Mode::ByteStr { rfc3349 }, &mut |_, c| match c {
+                    Ok(MixedUnit::Char(c)) => {
+                        buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
+                    }
+                    Ok(MixedUnit::HighByte(b)) => buf.push(b),
                     Err(err) => {
                         assert!(!err.is_fatal(), "failed to unescape string literal")
                     }
@@ -126,7 +132,7 @@ impl LitKind {
             token::CStr => {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
-                unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
+                unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
                     Ok(MixedUnit::Char(c)) => {
                         buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
                     }

diff --git a/compiler/rustc_ast_passes/src/feature_gate.rs b/compiler/rustc_ast_passes/src/feature_gate.rs
@@ -508,6 +508,8 @@ pub fn check_crate(krate: &ast::Crate, sess: &Session, features: &Features) {
             }
         };
     }
+    // njn: right wording?
+    gate_all!(mixed_utf8_literals, r#"mixed utf8 b"..." and br"..." literals are experimental"#);
     gate_all!(
         if_let_guard,
         "`if let` guards are experimental",

diff --git a/compiler/rustc_feature/src/unstable.rs b/compiler/rustc_feature/src/unstable.rs
@@ -520,6 +520,8 @@ declare_features! (
     /// standard library until the soundness issues with specialization
     /// are fixed.
     (unstable, min_specialization, "1.7.0", Some(31844)),
+    /// Allows mixed utf8 b"..." and br"..." literals.
+    (unstable, mixed_utf8_literals, "CURRENT_RUSTC_VERSION", Some(116907)),
     /// Allows qualified paths in struct expressions, struct patterns and tuple struct patterns.
     (unstable, more_qualified_paths, "1.54.0", Some(86935)),
     /// Allows the `#[must_not_suspend]` attribute.

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
@@ -9,6 +9,9 @@ use Mode::*;
 #[cfg(test)]
 mod tests;
 
+// njn: need to add tests in tests/ui/mixed-utf8-literals/; see
+// tests/ui/try-block/ for an example to follow
+
 /// Errors and warnings that can occur during string unescaping. They mostly
 /// relate to malformed escape sequences, but there are a few that are about
 /// other problems.
@@ -80,12 +83,12 @@ impl EscapeError {
     }
 }
 
-/// Takes a contents of a literal (without quotes) and produces a sequence of
-/// escaped characters or errors.
+/// Takes the contents of a non-mixed-utf8 literal (without quotes) and produces
+/// a sequence of escaped characters or errors.
 ///
 /// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
 /// the callback will be called exactly once.
-pub fn unescape_literal<F>(src: &str, mode: Mode, callback: &mut F)
+pub fn unescape_non_mixed<F>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
@@ -95,9 +98,18 @@ where
             let res = unescape_char_or_byte(&mut chars, mode);
             callback(0..(src.len() - chars.as_str().len()), res);
         }
-        Str | ByteStr => unescape_non_raw_common(src, mode, callback),
-        RawStr | RawByteStr => check_raw_common(src, mode, callback),
-        CStr | RawCStr => unreachable!(),
+        Str => unescape_non_raw_common(src, mode, callback),
+        RawStr => check_raw_common(src, mode, callback),
+        RawByteStr { .. } => check_raw_common(src, mode, &mut |r, result| callback(r, result)),
+        RawCStr => {
+            check_raw_common(src, mode, &mut |r, mut result| {
+                if let Ok('\0') = result {
+                    result = Err(EscapeError::NulInCStr);
+                }
+                callback(r, result)
+            });
+        }
+        ByteStr { .. } | CStr => unreachable!(),
     }
 }
 
@@ -132,11 +144,16 @@ impl From<u8> for MixedUnit {
     }
 }
 
-pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
+/// Takes the contents of a mixed-utf8 literal (without quotes) and produces
+/// a sequence of escaped characters or errors.
+///
+/// Values are returned by invoking `callback`.
+pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
 {
     match mode {
+        ByteStr { .. } => unescape_non_raw_common(src, mode, &mut |r, result| callback(r, result)),
         CStr => {
             unescape_non_raw_common(src, mode, &mut |r, mut result| {
                 if let Ok(MixedUnit::Char('\0')) = result {
@@ -145,16 +162,7 @@ where
                 callback(r, result)
             });
         }
-        RawCStr => {
-            check_raw_common(src, mode, &mut |r, mut result| {
-                if let Ok('\0') = result {
-                    result = Err(EscapeError::NulInCStr);
-                }
-                // High bytes aren't possible in raw strings.
-                callback(r, result.map(MixedUnit::Char))
-            });
-        }
-        Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
+        Char | Byte | Str | RawStr | RawByteStr { .. } | RawCStr => unreachable!(),
     }
 }
 
@@ -180,8 +188,8 @@ pub enum Mode {
     Str,
     RawStr,
 
-    ByteStr,
-    RawByteStr,
+    ByteStr { rfc3349: bool },
+    RawByteStr { rfc3349: bool },
 
     CStr,
     RawCStr,
@@ -190,7 +198,7 @@ pub enum Mode {
 impl Mode {
     pub fn in_double_quotes(self) -> bool {
         match self {
-            Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true,
+            Str | RawStr | ByteStr { .. } | RawByteStr { .. } | CStr | RawCStr => true,
             Char | Byte => false,
         }
     }
@@ -199,33 +207,39 @@ impl Mode {
     fn allow_high_bytes(self) -> bool {
         match self {
             Char | Str => false,
-            Byte | ByteStr | CStr => true,
-            RawStr | RawByteStr | RawCStr => unreachable!(),
+            Byte | ByteStr { .. } | CStr => true,
+            RawStr | RawByteStr { .. } | RawCStr => unreachable!(),
         }
     }
 
     /// Are unicode (non-ASCII) chars allowed?
     #[inline]
     fn allow_unicode_chars(self) -> bool {
         match self {
-            Byte | ByteStr | RawByteStr => false,
-            Char | Str | RawStr | CStr | RawCStr => true,
+            Byte | ByteStr { rfc3349: false } | RawByteStr { rfc3349: false } => false,
+            Char
+            | Str
+            | RawStr
+            | ByteStr { rfc3349: true }
+            | RawByteStr { rfc3349: true }
+            | CStr
+            | RawCStr => true,
         }
     }
 
     /// Are unicode escapes (`\u`) allowed?
     fn allow_unicode_escapes(self) -> bool {
         match self {
-            Byte | ByteStr => false,
-            Char | Str | CStr => true,
-            RawByteStr | RawStr | RawCStr => unreachable!(),
+            Byte | ByteStr { rfc3349: false } => false,
+            Char | Str | ByteStr { rfc3349: true } | CStr => true,
+            RawByteStr { .. } | RawStr | RawCStr => unreachable!(),
         }
     }
 
     pub fn prefix_noraw(self) -> &'static str {
         match self {
             Char | Str | RawStr => "",
-            Byte | ByteStr | RawByteStr => "b",
+            Byte | ByteStr { .. } | RawByteStr { .. } => "b",
             CStr | RawCStr => "c",
         }
     }
@@ -263,12 +277,14 @@ fn scan_escape<T: From<char> + From<u8>>(
                 Ok(T::from(value as u8))
             };
         }
+        // njn: gate: is it a ByteStr?
         'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
         _ => return Err(EscapeError::InvalidEscape),
     };
     Ok(T::from(res))
 }
 
+// njn: change arg to mode in precursor?
 fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
     // We've parsed '\u', now we have to parse '{..}'.
 
@@ -333,6 +349,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
         '\\' => scan_escape(chars, mode),
         '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
         '\r' => Err(EscapeError::BareCarriageReturn),
+        // njn: this is the only ascii_check that will remain
         _ => ascii_check(c, mode.allow_unicode_chars()),
     }?;
     if chars.next().is_some() {
@@ -373,6 +390,10 @@ where
             }
             '"' => Err(EscapeError::EscapeOnlyChar),
             '\r' => Err(EscapeError::BareCarriageReturn),
+
+            // njn: gate, similar to check_raw_common, check:
+            // - is it a ByteStr AND does it contain a unicode char
+
             _ => ascii_check(c, allow_unicode_chars).map(T::from),
         };
         let end = src.len() - chars.as_str().len();
@@ -424,6 +445,15 @@ where
         let start = src.len() - chars.as_str().len() - c.len_utf8();
         let res = match c {
             '\r' => Err(EscapeError::BareCarriageReturnInRawString),
+
+            // njn: gate: need to somehow return an indication of whether
+            // rfc3349 unicode char allowance was required for this literal,
+            // i.e. check
+            // - is it a RawByteStr AND does it contain a unicode char
+            //
+            // njn: but the ascii_check itself isn't necessary
+            // - or make it return three values? ok, ok-with-3349, bad?
+
             _ => ascii_check(c, allow_unicode_chars),
         };
         let end = src.len() - chars.as_str().len();
@@ -432,8 +462,8 @@ where
 }
 
 #[inline]
-pub fn byte_from_char(c: char) -> u8 {
+pub(crate) fn byte_from_char(c: char) -> u8 {
     let res = c as u32;
-    debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
+    debug_assert!(res <= u8::MAX as u32, "guaranteed because of Byte");
     res as u8
 }
diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs
@@ -100,7 +100,7 @@ fn test_unescape_char_good() {
 fn test_unescape_str_warn() {
     fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
         let mut unescaped = Vec::with_capacity(literal.len());
-        unescape_literal(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
+        unescape_non_mixed(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
         assert_eq!(unescaped, expected);
     }
 
@@ -124,7 +124,7 @@ fn test_unescape_str_warn() {
 fn test_unescape_str_good() {
     fn check(literal_text: &str, expected: &str) {
         let mut buf = Ok(String::with_capacity(literal_text.len()));
-        unescape_literal(literal_text, Mode::Str, &mut |range, c| {
+        unescape_non_mixed(literal_text, Mode::Str, &mut |range, c| {
             if let Ok(b) = &mut buf {
                 match c {
                     Ok(c) => b.push(c),
@@ -240,16 +240,19 @@ fn test_unescape_byte_good() {
 #[test]
 fn test_unescape_byte_str_good() {
     fn check(literal_text: &str, expected: &[u8]) {
-        let mut buf = Ok(Vec::with_capacity(literal_text.len()));
-        unescape_literal(literal_text, Mode::ByteStr, &mut |range, c| {
-            if let Ok(b) = &mut buf {
+        let mut buf_res = Ok(Vec::with_capacity(literal_text.len()));
+        unescape_mixed(literal_text, Mode::ByteStr { rfc3349: false }, &mut |range, c| {
+            if let Ok(buf) = &mut buf_res {
                 match c {
-                    Ok(c) => b.push(byte_from_char(c)),
-                    Err(e) => buf = Err((range, e)),
+                    Ok(MixedUnit::Char(c)) => {
+                        buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
+                    }
+                    Ok(MixedUnit::HighByte(b)) => buf.push(b),
+                    Err(e) => buf_res = Err((range, e)),
                 }
             }
         });
-        assert_eq!(buf.as_deref(), Ok(expected))
+        assert_eq!(buf_res.as_deref(), Ok(expected))
     }
 
     check("foo", b"foo");
@@ -264,7 +267,7 @@ fn test_unescape_byte_str_good() {
 fn test_unescape_raw_str() {
     fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
         let mut unescaped = Vec::with_capacity(literal.len());
-        unescape_literal(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
+        unescape_non_mixed(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
         assert_eq!(unescaped, expected);
     }
 
@@ -276,7 +279,9 @@ fn test_unescape_raw_str() {
 fn test_unescape_raw_byte_str() {
     fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
         let mut unescaped = Vec::with_capacity(literal.len());
-        unescape_literal(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res)));
+        unescape_non_mixed(literal, Mode::RawByteStr { rfc3349: false }, &mut |range, res| {
+            unescaped.push((range, res))
+        });
         assert_eq!(unescaped, expected);
     }
 

diff --git a/compiler/rustc_parse/messages.ftl b/compiler/rustc_parse/messages.ftl
@@ -814,6 +814,10 @@ parse_unexpected_vert_vert_before_function_parameter = unexpected `||` before fu
 parse_unexpected_vert_vert_in_pattern = unexpected token `||` in pattern
     .suggestion = use a single `|` to separate multiple alternative patterns
 
+# njn:
+# - b'\u{1234}' error says "unicode escape in byte string", should be "byte literal"
+# - after rfc3349 stabilizes, byte literal will be the only error case here
+# - could add a `.desc` field in a precursor
 parse_unicode_escape_in_byte = unicode escape in byte string
     .label = {parse_unicode_escape_in_byte}
     .help = unicode escape sequences cannot be used as a byte or in a byte string