Compare commits
4 commits: perf-node-... charlie/pa

| Author | SHA1 | Date |
|---|---|---|
|  | 9ae4fb3b9f |  |
|  | c67d68271d |  |
|  | 56b148bb43 |  |
|  | 0a5a4f6d92 |  |
Cargo.lock (generated, 18 lines changed)
@@ -217,12 +217,12 @@ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"

[[package]]
name = "bstr"
version = "1.6.2"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a"
checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc"
dependencies = [
 "memchr",
 "regex-automata 0.3.9",
 "regex-automata 0.4.3",
 "serde",
]

@@ -1921,12 +1915,6 @@ dependencies = [
 "regex-syntax 0.6.29",
]

[[package]]
name = "regex-automata"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"

[[package]]
name = "regex-automata"
version = "0.4.3"
@@ -2342,16 +2336,22 @@ version = "0.0.0"
dependencies = [
 "anyhow",
 "bitflags 2.4.1",
 "bstr",
 "codspeed-criterion-compat",
 "criterion",
 "insta",
 "is-macro",
 "itertools 0.12.1",
 "lalrpop",
 "lalrpop-util",
 "memchr",
 "mimalloc",
 "once_cell",
 "ruff_python_ast",
 "ruff_text_size",
 "rustc-hash",
 "static_assertions",
 "tikv-jemallocator",
 "tiny-keccak",
 "unicode-ident",
 "unicode_names2",
@@ -19,6 +19,7 @@ argfile = { version = "0.1.6" }
assert_cmd = { version = "2.0.13" }
bincode = { version = "1.3.3" }
bitflags = { version = "2.4.1" }
bstr = { version = "1.9.0" }
cachedir = { version = "0.3.1" }
chrono = { version = "0.4.33", default-features = false, features = ["clock"] }
clap = { version = "4.4.18", features = ["derive"] }
@@ -40,7 +40,9 @@ impl Violation for HardcodedBindAllInterfaces {
pub(crate) fn hardcoded_bind_all_interfaces(checker: &mut Checker, string: StringLike) {
    let is_bind_all_interface = match string {
        StringLike::StringLiteral(ast::ExprStringLiteral { value, .. }) => value == "0.0.0.0",
        StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => value == "0.0.0.0",
        StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => {
            &**value == "0.0.0.0"
        }
        StringLike::BytesLiteral(_) => return,
    };
@@ -15,7 +15,7 @@ fn to_f_string_expression_element(inner: &Expr) -> ast::FStringElement {
/// Convert a string to a [`ast::FStringElement::Literal`].
pub(super) fn to_f_string_literal_element(s: &str) -> ast::FStringElement {
    ast::FStringElement::Literal(ast::FStringLiteralElement {
        value: s.to_owned(),
        value: s.to_string().into_boxed_str(),
        range: TextRange::default(),
    })
}

@@ -53,7 +53,7 @@ pub(super) fn to_f_string_element(expr: &Expr) -> Option<ast::FStringElement> {
    match expr {
        Expr::StringLiteral(ast::ExprStringLiteral { value, range }) => {
            Some(ast::FStringElement::Literal(ast::FStringLiteralElement {
                value: value.to_string(),
                value: value.to_string().into_boxed_str(),
                range: *range,
            }))
        }
@@ -644,7 +644,7 @@ pub struct ComparableBytesLiteral<'a> {
impl<'a> From<&'a ast::BytesLiteral> for ComparableBytesLiteral<'a> {
    fn from(bytes_literal: &'a ast::BytesLiteral) -> Self {
        Self {
            value: bytes_literal.value.as_slice(),
            value: &bytes_literal.value,
        }
    }
}
@@ -949,7 +949,7 @@ impl Ranged for FStringExpressionElement {
#[derive(Clone, Debug, PartialEq)]
pub struct FStringLiteralElement {
    pub range: TextRange,
    pub value: String,
    pub value: Box<str>,
}

impl Ranged for FStringLiteralElement {

@@ -962,7 +962,7 @@ impl Deref for FStringLiteralElement {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        self.value.as_str()
        &self.value
    }
}

@@ -1607,7 +1607,7 @@ impl Default for BytesLiteralValueInner {
#[derive(Clone, Debug, Default, PartialEq)]
pub struct BytesLiteral {
    pub range: TextRange,
    pub value: Vec<u8>,
    pub value: Box<[u8]>,
}

impl Ranged for BytesLiteral {

@@ -1620,7 +1620,7 @@ impl Deref for BytesLiteral {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        self.value.as_slice()
        &self.value
    }
}
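These AST hunks all follow one pattern: literal nodes now own a `Box<str>` or `Box<[u8]>` instead of a `String`/`Vec<u8>`, and the `Deref` impls let call sites keep borrowing `&str`/`&[u8]` (hence `&**value == "0.0.0.0"` in the S104 rule above). A minimal sketch of that pattern, using an illustrative type name rather than the actual ruff structs:

```rust
use std::ops::Deref;

// Owns its text without the spare-capacity field a `String` carries.
struct LiteralElement {
    value: Box<str>,
}

impl Deref for LiteralElement {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        // `Box<str>` derefs to `str`, so a plain re-borrow is enough.
        &self.value
    }
}

fn main() {
    let element = LiteralElement {
        value: "0.0.0.0".to_string().into_boxed_str(),
    };
    // Call sites compare through the deref, mirroring `&**value == "0.0.0.0"` in the diff.
    assert_eq!(&*element, "0.0.0.0");
}
```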
@@ -12,6 +12,11 @@ license = { workspace = true }
build = "build.rs"

[lib]
bench = false

[[bench]]
name = "string"
harness = false

[dependencies]
ruff_python_ast = { path = "../ruff_python_ast" }

@@ -19,14 +24,24 @@ ruff_text_size = { path = "../ruff_text_size" }

anyhow = { workspace = true }
bitflags = { workspace = true }
bstr = { workspace = true }
is-macro = { workspace = true }
itertools = { workspace = true }
lalrpop-util = { workspace = true, default-features = false }
memchr = { workspace = true }
unicode-ident = { workspace = true }
unicode_names2 = { workspace = true }
rustc-hash = { workspace = true }
static_assertions = { workspace = true }
unicode-ident = { workspace = true }
unicode_names2 = { workspace = true }
once_cell = "1.19.0"
criterion = { workspace = true, default-features = false }
codspeed-criterion-compat = { workspace = true, default-features = false, optional = true}

[target.'cfg(target_os = "windows")'.dev-dependencies]
mimalloc = { workspace = true }

[target.'cfg(all(not(target_os = "windows"), not(target_os = "openbsd"), any(target_arch = "x86_64", target_arch = "aarch64", target_arch = "powerpc64")))'.dev-dependencies]
tikv-jemallocator = { workspace = true }

[dev-dependencies]
insta = { workspace = true }
crates/ruff_python_parser/benches/string.rs (new file, 93 lines)
@@ -0,0 +1,93 @@
use criterion::{
    black_box, criterion_group, criterion_main, measurement::WallTime, BatchSize, Criterion,
};
use ruff_python_parser::StringKind;
use ruff_text_size::TextRange;

#[cfg(target_os = "windows")]
#[global_allocator]
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;

#[cfg(all(
    not(target_os = "windows"),
    not(target_os = "openbsd"),
    any(
        target_arch = "x86_64",
        target_arch = "aarch64",
        target_arch = "powerpc64"
    )
))]
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

fn benchmark_parser(criterion: &mut Criterion<WallTime>) {
    let mut group = criterion.benchmark_group("parse");

    let s = "\"\"\"Validate length based{ on BIN for major brands:
https://en.wikipedia.org/wiki/Payment_card_number#Issuer_identification_number_(IIN)\"\"\"";

    // group.bench_with_input("new_string", &s, |b, &s| {
    //     b.iter_batched(
    //         || s.to_string().into_boxed_str(),
    //         |data| {
    //             ruff_python_parser::string::parse_string_literal(
    //                 black_box(data),
    //                 StringKind::String,
    //                 true,
    //                 TextRange::default(),
    //             )
    //         },
    //         BatchSize::SmallInput,
    //     );
    // });
    //
    // group.bench_function("old_string", |b| {
    //     b.iter_batched(
    //         || s.to_string(),
    //         |data| {
    //             ruff_python_parser::old_string::parse_string_literal(
    //                 black_box(&data),
    //                 StringKind::String,
    //                 true,
    //                 TextRange::default(),
    //             )
    //         },
    //         BatchSize::SmallInput,
    //     );
    // });

    let s = "Item {i+1}";

    group.bench_with_input("new_fstring", &s, |b, &s| {
        b.iter_batched(
            || s.to_string().into_boxed_str(),
            |data| {
                ruff_python_parser::string::parse_fstring_literal_element(
                    black_box(data),
                    true,
                    TextRange::default(),
                )
            },
            BatchSize::SmallInput,
        );
    });

    group.bench_function("old_fstring", |b| {
        b.iter_batched(
            || s.to_string(),
            |data| {
                ruff_python_parser::old_string::parse_fstring_literal_element(
                    black_box(&data),
                    true,
                    TextRange::default(),
                )
            },
            BatchSize::SmallInput,
        );
    });

    group.finish();
}

criterion_group!(parser, benchmark_parser);
criterion_main!(parser);
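The benchmark builds the owned, boxed input inside the `iter_batched` setup closure, so the allocation happens outside the timed region and only the parse call is measured. A stripped-down sketch of that Criterion pattern (the timed closure below is a placeholder for the real `parse_*` call):

```rust
use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion};

fn bench_boxed_input(criterion: &mut Criterion) {
    let source = "Item {i+1}";

    criterion.bench_function("boxed_input", |b| {
        b.iter_batched(
            // Setup is untimed: allocate the owned, boxed input here.
            || source.to_string().into_boxed_str(),
            // Only this closure is timed; stand-in work instead of the real parse call.
            |input: Box<str>| black_box(input.len()),
            BatchSize::SmallInput,
        );
    });
}

criterion_group!(benches, bench_boxed_input);
criterion_main!(benches);
```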
@@ -119,14 +119,15 @@ pub use token::{StringKind, Tok, TokenKind};

use crate::lexer::LexResult;

mod function;
// Skip flattening lexer to distinguish from full ruff_python_parser
mod context;
mod function;
mod invalid;
// Skip flattening lexer to distinguish from full ruff_python_parser
pub mod lexer;
pub mod old_string;
mod parser;
mod soft_keywords;
mod string;
pub mod string;
mod token;
mod token_source;
pub mod typing;
crates/ruff_python_parser/src/old_string.rs (new file, 820 lines)
@@ -0,0 +1,820 @@
|
||||
//! Parsing of string literals, bytes literals, and implicit string concatenation.
|
||||
|
||||
use ruff_python_ast::{self as ast, Expr};
|
||||
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
|
||||
|
||||
use crate::lexer::{LexicalError, LexicalErrorType};
|
||||
use crate::string::FStringError;
|
||||
use crate::token::{StringKind, Tok};
|
||||
|
||||
pub enum StringType {
|
||||
Str(ast::StringLiteral),
|
||||
Bytes(ast::BytesLiteral),
|
||||
FString(ast::FString),
|
||||
}
|
||||
|
||||
impl Ranged for StringType {
|
||||
fn range(&self) -> TextRange {
|
||||
match self {
|
||||
Self::Str(node) => node.range(),
|
||||
Self::Bytes(node) => node.range(),
|
||||
Self::FString(node) => node.range(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<StringType> for Expr {
|
||||
fn from(string: StringType) -> Self {
|
||||
match string {
|
||||
StringType::Str(node) => Expr::from(node),
|
||||
StringType::Bytes(node) => Expr::from(node),
|
||||
StringType::FString(node) => Expr::from(node),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct StringParser<'a> {
|
||||
rest: &'a str,
|
||||
kind: StringKind,
|
||||
location: TextSize,
|
||||
range: TextRange,
|
||||
}
|
||||
|
||||
impl<'a> StringParser<'a> {
|
||||
fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self {
|
||||
Self {
|
||||
rest: source,
|
||||
kind,
|
||||
location: start,
|
||||
range,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn skip_bytes(&mut self, bytes: usize) -> &'a str {
|
||||
let skipped_str = &self.rest[..bytes];
|
||||
self.rest = &self.rest[bytes..];
|
||||
self.location += skipped_str.text_len();
|
||||
skipped_str
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_pos(&self) -> TextSize {
|
||||
self.location
|
||||
}
|
||||
|
||||
/// Returns the next byte in the string, if there is one.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// When the next byte is a part of a multi-byte character.
|
||||
#[inline]
|
||||
fn next_byte(&mut self) -> Option<u8> {
|
||||
self.rest.as_bytes().first().map(|&byte| {
|
||||
self.rest = &self.rest[1..];
|
||||
self.location += TextSize::new(1);
|
||||
byte
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_char(&mut self) -> Option<char> {
|
||||
self.rest.chars().next().map(|c| {
|
||||
self.rest = &self.rest[c.len_utf8()..];
|
||||
self.location += c.text_len();
|
||||
c
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn peek_byte(&self) -> Option<u8> {
|
||||
self.rest.as_bytes().first().copied()
|
||||
}
|
||||
|
||||
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
|
||||
let mut p: u32 = 0u32;
|
||||
let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos());
|
||||
for i in 1..=literal_number {
|
||||
match self.next_char() {
|
||||
Some(c) => match c.to_digit(16) {
|
||||
Some(d) => p += d << ((literal_number - i) * 4),
|
||||
None => return Err(unicode_error),
|
||||
},
|
||||
None => return Err(unicode_error),
|
||||
}
|
||||
}
|
||||
match p {
|
||||
0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER),
|
||||
_ => std::char::from_u32(p).ok_or(unicode_error),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_octet(&mut self, o: u8) -> char {
|
||||
let mut radix_bytes = [o, 0, 0];
|
||||
let mut len = 1;
|
||||
|
||||
while len < 3 {
|
||||
let Some(b'0'..=b'7') = self.peek_byte() else {
|
||||
break;
|
||||
};
|
||||
|
||||
radix_bytes[len] = self.next_byte().unwrap();
|
||||
len += 1;
|
||||
}
|
||||
|
||||
// OK because radix_bytes is always going to be in the ASCII range.
|
||||
let radix_str = std::str::from_utf8(&radix_bytes[..len]).expect("ASCII bytes");
|
||||
let value = u32::from_str_radix(radix_str, 8).unwrap();
|
||||
char::from_u32(value).unwrap()
|
||||
}
|
||||
|
||||
fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
|
||||
let start_pos = self.get_pos();
|
||||
|
||||
let Some('{') = self.next_char() else {
|
||||
return Err(LexicalError::new(LexicalErrorType::StringError, start_pos));
|
||||
};
|
||||
|
||||
let start_pos = self.get_pos();
|
||||
let Some(close_idx) = self.rest.find('}') else {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::StringError,
|
||||
self.get_pos(),
|
||||
));
|
||||
};
|
||||
|
||||
let name_and_ending = self.skip_bytes(close_idx + 1);
|
||||
let name = &name_and_ending[..name_and_ending.len() - 1];
|
||||
|
||||
unicode_names2::character(name)
|
||||
.ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
|
||||
}
|
||||
|
||||
fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
|
||||
let Some(first_char) = self.next_char() else {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::StringError,
|
||||
self.get_pos(),
|
||||
));
|
||||
};
|
||||
|
||||
let new_char = match first_char {
|
||||
'\\' => '\\',
|
||||
'\'' => '\'',
|
||||
'\"' => '"',
|
||||
'a' => '\x07',
|
||||
'b' => '\x08',
|
||||
'f' => '\x0c',
|
||||
'n' => '\n',
|
||||
'r' => '\r',
|
||||
't' => '\t',
|
||||
'v' => '\x0b',
|
||||
o @ '0'..='7' => self.parse_octet(o as u8),
|
||||
'x' => self.parse_unicode_literal(2)?,
|
||||
'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?,
|
||||
'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
|
||||
'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
|
||||
// Special cases where the escape sequence is not a single character
|
||||
'\n' => return Ok(()),
|
||||
'\r' => {
|
||||
if self.peek_byte() == Some(b'\n') {
|
||||
self.next_byte();
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
_ => {
|
||||
if self.kind.is_any_bytes() && !first_char.is_ascii() {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::OtherError(
|
||||
"bytes can only contain ASCII literal characters"
|
||||
.to_string()
|
||||
.into_boxed_str(),
|
||||
),
|
||||
self.get_pos(),
|
||||
));
|
||||
}
|
||||
|
||||
string.push('\\');
|
||||
|
||||
first_char
|
||||
}
|
||||
};
|
||||
|
||||
string.push(new_char);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_fstring_middle(&mut self) -> Result<ast::FStringElement, LexicalError> {
|
||||
let mut value = String::with_capacity(self.rest.len());
|
||||
while let Some(ch) = self.next_char() {
|
||||
match ch {
|
||||
// We can encounter a `\` as the last character in a `FStringMiddle`
|
||||
// token which is valid in this context. For example,
|
||||
//
|
||||
// ```python
|
||||
// f"\{foo} \{bar:\}"
|
||||
// # ^ ^^ ^
|
||||
// ```
|
||||
//
|
||||
// Here, the `FStringMiddle` token content will be "\" and " \"
|
||||
// which is invalid if we look at the content in isolation:
|
||||
//
|
||||
// ```python
|
||||
// "\"
|
||||
// ```
|
||||
//
|
||||
// However, the content is syntactically valid in the context of
|
||||
// the f-string because it's a substring of the entire f-string.
|
||||
// This is still an invalid escape sequence, but we don't want to
|
||||
// raise a syntax error as is done by the CPython parser. It might
|
||||
// be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
|
||||
'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
|
||||
self.parse_escaped_char(&mut value)?;
|
||||
}
|
||||
// If there are any curly braces inside a `FStringMiddle` token,
|
||||
// then they were escaped (i.e. `{{` or `}}`). This means that
|
||||
// we need to increase the location by 2 instead of 1.
|
||||
ch @ ('{' | '}') => {
|
||||
self.location += ch.text_len();
|
||||
value.push(ch);
|
||||
}
|
||||
ch => value.push(ch),
|
||||
}
|
||||
}
|
||||
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
||||
value: value.into_boxed_str(),
|
||||
range: self.range,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_bytes(&mut self) -> Result<StringType, LexicalError> {
|
||||
let mut content = String::with_capacity(self.rest.len());
|
||||
while let Some(ch) = self.next_char() {
|
||||
match ch {
|
||||
'\\' if !self.kind.is_raw() => {
|
||||
self.parse_escaped_char(&mut content)?;
|
||||
}
|
||||
ch => {
|
||||
if !ch.is_ascii() {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::OtherError(
|
||||
"bytes can only contain ASCII literal characters"
|
||||
.to_string()
|
||||
.into_boxed_str(),
|
||||
),
|
||||
self.get_pos(),
|
||||
));
|
||||
}
|
||||
content.push(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(StringType::Bytes(ast::BytesLiteral {
|
||||
value: content
|
||||
.chars()
|
||||
.map(|c| c as u8)
|
||||
.collect::<Vec<u8>>()
|
||||
.into_boxed_slice(),
|
||||
range: self.range,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_string(&mut self) -> Result<StringType, LexicalError> {
|
||||
let mut value = String::with_capacity(self.rest.len());
|
||||
if self.kind.is_raw() {
|
||||
value.push_str(self.skip_bytes(self.rest.len()));
|
||||
} else {
|
||||
loop {
|
||||
let Some(escape_idx) = self.rest.find('\\') else {
|
||||
value.push_str(self.skip_bytes(self.rest.len()));
|
||||
break;
|
||||
};
|
||||
|
||||
let before_with_slash = self.skip_bytes(escape_idx + 1);
|
||||
let before = &before_with_slash[..before_with_slash.len() - 1];
|
||||
|
||||
value.push_str(before);
|
||||
self.parse_escaped_char(&mut value)?;
|
||||
}
|
||||
}
|
||||
Ok(StringType::Str(ast::StringLiteral {
|
||||
value: value.into_boxed_str(),
|
||||
unicode: self.kind.is_unicode(),
|
||||
range: self.range,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse(&mut self) -> Result<StringType, LexicalError> {
|
||||
if self.kind.is_any_bytes() {
|
||||
self.parse_bytes()
|
||||
} else {
|
||||
self.parse_string()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_string_literal(
|
||||
source: &str,
|
||||
kind: StringKind,
|
||||
triple_quoted: bool,
|
||||
range: TextRange,
|
||||
) -> Result<StringType, LexicalError> {
|
||||
let start_location = range.start()
|
||||
+ kind.prefix_len()
|
||||
+ if triple_quoted {
|
||||
TextSize::from(3)
|
||||
} else {
|
||||
TextSize::from(1)
|
||||
};
|
||||
StringParser::new(source, kind, start_location, range).parse()
|
||||
}
|
||||
|
||||
pub fn parse_fstring_literal_element(
|
||||
source: &str,
|
||||
is_raw: bool,
|
||||
range: TextRange,
|
||||
) -> Result<ast::FStringElement, LexicalError> {
|
||||
let kind = if is_raw {
|
||||
StringKind::RawString
|
||||
} else {
|
||||
StringKind::String
|
||||
};
|
||||
StringParser::new(source, kind, range.start(), range).parse_fstring_middle()
|
||||
}
|
||||
|
||||
pub(crate) fn concatenated_strings(
|
||||
strings: Vec<StringType>,
|
||||
range: TextRange,
|
||||
) -> Result<Expr, LexicalError> {
|
||||
#[cfg(debug_assertions)]
|
||||
debug_assert!(strings.len() > 1);
|
||||
|
||||
let mut has_fstring = false;
|
||||
let mut byte_literal_count = 0;
|
||||
for string in &strings {
|
||||
match string {
|
||||
StringType::FString(_) => has_fstring = true,
|
||||
StringType::Bytes(_) => byte_literal_count += 1,
|
||||
StringType::Str(_) => {}
|
||||
}
|
||||
}
|
||||
let has_bytes = byte_literal_count > 0;
|
||||
|
||||
if has_bytes && byte_literal_count < strings.len() {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::OtherError(
|
||||
"cannot mix bytes and nonbytes literals"
|
||||
.to_string()
|
||||
.into_boxed_str(),
|
||||
),
|
||||
range.start(),
|
||||
));
|
||||
}
|
||||
|
||||
if has_bytes {
|
||||
let mut values = Vec::with_capacity(strings.len());
|
||||
for string in strings {
|
||||
match string {
|
||||
StringType::Bytes(value) => values.push(value),
|
||||
_ => unreachable!("Unexpected non-bytes literal."),
|
||||
}
|
||||
}
|
||||
return Ok(Expr::from(ast::ExprBytesLiteral {
|
||||
value: ast::BytesLiteralValue::concatenated(values),
|
||||
range,
|
||||
}));
|
||||
}
|
||||
|
||||
if !has_fstring {
|
||||
let mut values = Vec::with_capacity(strings.len());
|
||||
for string in strings {
|
||||
match string {
|
||||
StringType::Str(value) => values.push(value),
|
||||
_ => unreachable!("Unexpected non-string literal."),
|
||||
}
|
||||
}
|
||||
return Ok(Expr::from(ast::ExprStringLiteral {
|
||||
value: ast::StringLiteralValue::concatenated(values),
|
||||
range,
|
||||
}));
|
||||
}
|
||||
|
||||
let mut parts = Vec::with_capacity(strings.len());
|
||||
for string in strings {
|
||||
match string {
|
||||
StringType::FString(fstring) => parts.push(ast::FStringPart::FString(fstring)),
|
||||
StringType::Str(string) => parts.push(ast::FStringPart::Literal(string)),
|
||||
StringType::Bytes(_) => unreachable!("Unexpected bytes literal."),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ast::ExprFString {
|
||||
value: ast::FStringValue::concatenated(parts),
|
||||
range,
|
||||
}
|
||||
.into())
|
||||
}
|
||||
|
||||
/// Represents the different types of errors that can occur during parsing of an f-string.
|
||||
#[derive(Copy, Debug, Clone, PartialEq)]
|
||||
pub enum FStringErrorType {
|
||||
/// Expected a right brace after an opened left brace.
|
||||
UnclosedLbrace,
|
||||
/// An invalid conversion flag was encountered.
|
||||
InvalidConversionFlag,
|
||||
/// A single right brace was encountered.
|
||||
SingleRbrace,
|
||||
/// Unterminated string.
|
||||
UnterminatedString,
|
||||
/// Unterminated triple-quoted string.
|
||||
UnterminatedTripleQuotedString,
|
||||
// TODO(dhruvmanila): The parser can't catch all cases of this error, but
|
||||
// wherever it can, we'll display the correct error message.
|
||||
/// A lambda expression without parentheses was encountered.
|
||||
LambdaWithoutParentheses,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for FStringErrorType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
use FStringErrorType::{
|
||||
InvalidConversionFlag, LambdaWithoutParentheses, SingleRbrace, UnclosedLbrace,
|
||||
UnterminatedString, UnterminatedTripleQuotedString,
|
||||
};
|
||||
match self {
|
||||
UnclosedLbrace => write!(f, "expecting '}}'"),
|
||||
InvalidConversionFlag => write!(f, "invalid conversion character"),
|
||||
SingleRbrace => write!(f, "single '}}' is not allowed"),
|
||||
UnterminatedString => write!(f, "unterminated string"),
|
||||
UnterminatedTripleQuotedString => write!(f, "unterminated triple-quoted string"),
|
||||
LambdaWithoutParentheses => {
|
||||
write!(f, "lambda expressions are not allowed without parentheses")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::lexer::LexicalErrorType;
|
||||
use crate::parser::parse_suite;
|
||||
use crate::{ParseErrorType, Suite};
|
||||
|
||||
use super::*;
|
||||
|
||||
const WINDOWS_EOL: &str = "\r\n";
|
||||
const MAC_EOL: &str = "\r";
|
||||
const UNIX_EOL: &str = "\n";
|
||||
|
||||
fn string_parser_escaped_eol(eol: &str) -> Suite {
|
||||
let source = format!(r"'text \{eol}more text'");
|
||||
parse_suite(&source).unwrap()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_string_parser_escaped_unix_eol() {
|
||||
let parse_ast = string_parser_escaped_eol(UNIX_EOL);
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_string_parser_escaped_mac_eol() {
|
||||
let parse_ast = string_parser_escaped_eol(MAC_EOL);
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_string_parser_escaped_windows_eol() {
|
||||
let parse_ast = string_parser_escaped_eol(WINDOWS_EOL);
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_fstring() {
|
||||
let source = r#"f"{a}{ b }{{foo}}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_fstring_nested_spec() {
|
||||
let source = r#"f"{foo:{spec}}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_fstring_not_nested_spec() {
|
||||
let source = r#"f"{foo:spec}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_empty_fstring() {
|
||||
insta::assert_debug_snapshot!(parse_suite(r#"f"""#,).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fstring_parse_self_documenting_base() {
|
||||
let source = r#"f"{user=}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fstring_parse_self_documenting_base_more() {
|
||||
let source = r#"f"mix {user=} with text and {second=}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fstring_parse_self_documenting_format() {
|
||||
let source = r#"f"{user=:>10}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
fn parse_fstring_error(source: &str) -> FStringErrorType {
|
||||
parse_suite(source)
|
||||
.map_err(|e| match e.error {
|
||||
ParseErrorType::Lexical(LexicalErrorType::FStringError(e)) => e,
|
||||
e => unreachable!("Expected FStringError: {:?}", e),
|
||||
})
|
||||
.expect_err("Expected error")
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_invalid_fstring() {
|
||||
use FStringErrorType::{InvalidConversionFlag, LambdaWithoutParentheses};
|
||||
|
||||
assert_eq!(parse_fstring_error(r#"f"{5!x}""#), InvalidConversionFlag);
|
||||
assert_eq!(
|
||||
parse_fstring_error("f'{lambda x:{x}}'"),
|
||||
LambdaWithoutParentheses
|
||||
);
|
||||
assert_eq!(
|
||||
parse_fstring_error("f'{lambda x: {x}}'"),
|
||||
LambdaWithoutParentheses
|
||||
);
|
||||
assert!(parse_suite(r#"f"{class}""#,).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_fstring_not_equals() {
|
||||
let source = r#"f"{1 != 2}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_fstring_equals() {
|
||||
let source = r#"f"{42 == 42}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_fstring_self_doc_prec_space() {
|
||||
let source = r#"f"{x =}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_fstring_self_doc_trailing_space() {
|
||||
let source = r#"f"{x= }""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_fstring_yield_expr() {
|
||||
let source = r#"f"{yield}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_string_concat() {
|
||||
let source = "'Hello ' 'world'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_u_string_concat_1() {
|
||||
let source = "'Hello ' u'world'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_u_string_concat_2() {
|
||||
let source = "u'Hello ' 'world'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_f_string_concat_1() {
|
||||
let source = "'Hello ' f'world'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_f_string_concat_2() {
|
||||
let source = "'Hello ' f'world'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_f_string_concat_3() {
|
||||
let source = "'Hello ' f'world{\"!\"}'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_f_string_concat_4() {
|
||||
let source = "'Hello ' f'world{\"!\"}' 'again!'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_u_f_string_concat_1() {
|
||||
let source = "u'Hello ' f'world'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_u_f_string_concat_2() {
|
||||
let source = "u'Hello ' f'world' '!'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_string_triple_quotes_with_kind() {
|
||||
let source = "u'''Hello, world!'''";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_quoted_byte() {
|
||||
// single quote
|
||||
let source = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_double_quoted_byte() {
|
||||
// double quote
|
||||
let source = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape_char_in_byte_literal() {
|
||||
// backslash does not escape
|
||||
let source = r#"b"omkmok\Xaa""#; // spell-checker:ignore omkmok
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_raw_byte_literal_1() {
|
||||
let source = r"rb'\x1z'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_raw_byte_literal_2() {
|
||||
let source = r"rb'\\'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape_octet() {
|
||||
let source = r"b'\43a\4\1234'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fstring_escaped_newline() {
|
||||
let source = r#"f"\n{x}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fstring_constant_range() {
|
||||
let source = r#"f"aaa{bbb}ccc{ddd}eee""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fstring_unescaped_newline() {
|
||||
let source = r#"f"""
|
||||
{x}""""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fstring_escaped_character() {
|
||||
let source = r#"f"\\{x}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_raw_fstring() {
|
||||
let source = r#"rf"{x}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_triple_quoted_raw_fstring() {
|
||||
let source = r#"rf"""{x}""""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fstring_line_continuation() {
|
||||
let source = r#"rf"\
|
||||
{x}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_fstring_nested_string_spec() {
|
||||
let source = r#"f"{foo:{''}}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_fstring_nested_concatenation_string_spec() {
|
||||
let source = r#"f"{foo:{'' ''}}""#;
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
/// <https://github.com/astral-sh/ruff/issues/8355>
|
||||
#[test]
|
||||
fn test_dont_panic_on_8_in_octal_escape() {
|
||||
let source = r"bold = '\038[1m'";
|
||||
let parse_ast = parse_suite(source).unwrap();
|
||||
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
|
||||
macro_rules! test_aliases_parse {
|
||||
($($name:ident: $alias:expr,)*) => {
|
||||
$(
|
||||
#[test]
|
||||
fn $name() {
|
||||
let source = format!(r#""\N{{{0}}}""#, $alias);
|
||||
let parse_ast = parse_suite(&source).unwrap();
|
||||
insta::assert_debug_snapshot!(parse_ast);
|
||||
}
|
||||
)*
|
||||
}
|
||||
}
|
||||
|
||||
test_aliases_parse! {
|
||||
test_backspace_alias: "BACKSPACE",
|
||||
test_bell_alias: "BEL",
|
||||
test_carriage_return_alias: "CARRIAGE RETURN",
|
||||
test_delete_alias: "DELETE",
|
||||
test_escape_alias: "ESCAPE",
|
||||
test_form_feed_alias: "FORM FEED",
|
||||
test_hts_alias: "HTS",
|
||||
test_character_tabulation_with_justification_alias: "CHARACTER TABULATION WITH JUSTIFICATION",
|
||||
}
|
||||
}
|
||||
@@ -1616,7 +1616,7 @@ StringLiteralOrFString: StringType = {
|
||||
StringLiteral: StringType = {
|
||||
<location:@L> <string:string> <end_location:@R> =>? {
|
||||
let (source, kind, triple_quoted) = string;
|
||||
Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
|
||||
Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1633,7 +1633,7 @@ FStringMiddlePattern: ast::FStringElement = {
|
||||
FStringReplacementField,
|
||||
<location:@L> <fstring_middle:fstring_middle> <end_location:@R> =>? {
|
||||
let (source, is_raw, _) = fstring_middle;
|
||||
Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
|
||||
Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// auto-generated: "lalrpop 0.20.0"
|
||||
// sha3: 02c60b5c591440061dda68775005d87a203b5448c205120bda1566a62fc2147c
|
||||
// sha3: d38cc0f2252a58db42d3bd63a102b537865992b3cf51d402cdb4828f48989c9d
|
||||
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
|
||||
use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
|
||||
use crate::{
|
||||
@@ -36369,7 +36369,7 @@ fn __action217<
|
||||
{
|
||||
{
|
||||
let (source, kind, triple_quoted) = string;
|
||||
Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
|
||||
Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36419,7 +36419,7 @@ fn __action220<
|
||||
{
|
||||
{
|
||||
let (source, is_raw, _) = fstring_middle;
|
||||
Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
|
||||
Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
//! Parsing of string literals, bytes literals, and implicit string concatenation.
|
||||
|
||||
use bstr::ByteSlice;
|
||||
|
||||
use ruff_python_ast::{self as ast, Expr};
|
||||
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
|
||||
use ruff_text_size::{Ranged, TextRange, TextSize};
|
||||
|
||||
use crate::lexer::{LexicalError, LexicalErrorType};
|
||||
use crate::token::{StringKind, Tok};
|
||||
|
||||
pub(crate) enum StringType {
|
||||
pub enum StringType {
|
||||
Str(ast::StringLiteral),
|
||||
Bytes(ast::BytesLiteral),
|
||||
FString(ast::FString),
|
||||
@@ -32,34 +34,40 @@ impl From<StringType> for Expr {
|
||||
}
|
||||
}
|
||||
|
||||
struct StringParser<'a> {
|
||||
rest: &'a str,
|
||||
enum EscapedChar {
|
||||
Literal(char),
|
||||
Escape(char),
|
||||
}
|
||||
|
||||
struct StringParser {
|
||||
source: Box<str>,
|
||||
cursor: usize,
|
||||
kind: StringKind,
|
||||
location: TextSize,
|
||||
offset: TextSize,
|
||||
range: TextRange,
|
||||
}
|
||||
|
||||
impl<'a> StringParser<'a> {
|
||||
fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self {
|
||||
impl StringParser {
|
||||
fn new(source: Box<str>, kind: StringKind, offset: TextSize, range: TextRange) -> Self {
|
||||
Self {
|
||||
rest: source,
|
||||
source,
|
||||
cursor: 0,
|
||||
kind,
|
||||
location: start,
|
||||
offset,
|
||||
range,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn skip_bytes(&mut self, bytes: usize) -> &'a str {
|
||||
let skipped_str = &self.rest[..bytes];
|
||||
self.rest = &self.rest[bytes..];
|
||||
self.location += skipped_str.text_len();
|
||||
fn skip_bytes(&mut self, bytes: usize) -> &str {
|
||||
let skipped_str = &self.source[self.cursor..self.cursor + bytes];
|
||||
self.cursor += bytes;
|
||||
skipped_str
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_pos(&self) -> TextSize {
|
||||
self.location
|
||||
self.offset + TextSize::try_from(self.cursor).unwrap()
|
||||
}
|
||||
|
||||
/// Returns the next byte in the string, if there is one.
|
||||
@@ -69,25 +77,23 @@ impl<'a> StringParser<'a> {
|
||||
/// When the next byte is a part of a multi-byte character.
|
||||
#[inline]
|
||||
fn next_byte(&mut self) -> Option<u8> {
|
||||
self.rest.as_bytes().first().map(|&byte| {
|
||||
self.rest = &self.rest[1..];
|
||||
self.location += TextSize::new(1);
|
||||
self.source[self.cursor..].as_bytes().first().map(|&byte| {
|
||||
self.cursor += 1;
|
||||
byte
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_char(&mut self) -> Option<char> {
|
||||
self.rest.chars().next().map(|c| {
|
||||
self.rest = &self.rest[c.len_utf8()..];
|
||||
self.location += c.text_len();
|
||||
self.source[self.cursor..].chars().next().map(|c| {
|
||||
self.cursor += c.len_utf8();
|
||||
c
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn peek_byte(&self) -> Option<u8> {
|
||||
self.rest.as_bytes().first().copied()
|
||||
self.source[self.cursor..].as_bytes().first().copied()
|
||||
}
|
||||
|
||||
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
|
||||
@@ -135,7 +141,7 @@ impl<'a> StringParser<'a> {
|
||||
};
|
||||
|
||||
let start_pos = self.get_pos();
|
||||
let Some(close_idx) = self.rest.find('}') else {
|
||||
let Some(close_idx) = self.source[self.cursor..].find('}') else {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::StringError,
|
||||
self.get_pos(),
|
||||
@@ -149,7 +155,8 @@ impl<'a> StringParser<'a> {
|
||||
.ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
|
||||
}
|
||||
|
||||
fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
|
||||
/// Parse an escaped character, returning the new character.
|
||||
fn parse_escaped_char(&mut self) -> Result<Option<EscapedChar>, LexicalError> {
|
||||
let Some(first_char) = self.next_char() else {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::StringError,
|
||||
@@ -174,13 +181,13 @@ impl<'a> StringParser<'a> {
|
||||
'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
|
||||
'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
|
||||
// Special cases where the escape sequence is not a single character
|
||||
'\n' => return Ok(()),
|
||||
'\n' => return Ok(None),
|
||||
'\r' => {
|
||||
if self.peek_byte() == Some(b'\n') {
|
||||
self.next_byte();
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
return Ok(None);
|
||||
}
|
||||
_ => {
|
||||
if self.kind.is_any_bytes() && !first_char.is_ascii() {
|
||||
@@ -194,21 +201,42 @@ impl<'a> StringParser<'a> {
|
||||
));
|
||||
}
|
||||
|
||||
string.push('\\');
|
||||
|
||||
first_char
|
||||
return Ok(Some(EscapedChar::Escape(first_char)));
|
||||
}
|
||||
};
|
||||
|
||||
string.push(new_char);
|
||||
|
||||
Ok(())
|
||||
Ok(Some(EscapedChar::Literal(new_char)))
|
||||
}
|
||||
|
||||
fn parse_fstring_middle(&mut self) -> Result<ast::FStringElement, LexicalError> {
|
||||
let mut value = String::with_capacity(self.rest.len());
|
||||
while let Some(ch) = self.next_char() {
|
||||
match ch {
|
||||
fn parse_fstring_middle(mut self) -> Result<ast::FStringElement, LexicalError> {
|
||||
// Fast-path: if the f-string doesn't contain any escape sequences, return the literal.
|
||||
let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else {
|
||||
return Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
||||
value: self.source,
|
||||
range: self.range,
|
||||
}));
|
||||
};
|
||||
|
||||
let mut value = String::with_capacity(self.source.len());
|
||||
loop {
|
||||
// Add the characters before the escape sequence (or curly brace) to the string.
|
||||
let before_with_slash_or_brace = self.skip_bytes(index + 1);
|
||||
let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1];
|
||||
value.push_str(before);
|
||||
|
||||
// Add the escaped character to the string.
|
||||
match &self.source.as_bytes()[self.cursor - 1] {
|
||||
// If there are any curly braces inside a `FStringMiddle` token,
|
||||
// then they were escaped (i.e. `{{` or `}}`). This means that
|
||||
// we need to increase the location by 2 instead of 1.
|
||||
b'{' => {
|
||||
self.offset += TextSize::from(1);
|
||||
value.push('{');
|
||||
}
|
||||
b'}' => {
|
||||
self.offset += TextSize::from(1);
|
||||
value.push('}');
|
||||
}
|
||||
// We can encounter a `\` as the last character in a `FStringMiddle`
|
||||
// token which is valid in this context. For example,
|
||||
//
|
||||
@@ -229,71 +257,152 @@ impl<'a> StringParser<'a> {
|
||||
// This is still an invalid escape sequence, but we don't want to
|
||||
// raise a syntax error as is done by the CPython parser. It might
|
||||
// be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
|
||||
'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
|
||||
self.parse_escaped_char(&mut value)?;
|
||||
}
|
||||
// If there are any curly braces inside a `FStringMiddle` token,
|
||||
// then they were escaped (i.e. `{{` or `}}`). This means that
|
||||
// we need to increase the location by 2 instead of 1.
|
||||
ch @ ('{' | '}') => {
|
||||
self.location += ch.text_len();
|
||||
value.push(ch);
|
||||
}
|
||||
ch => value.push(ch),
|
||||
}
|
||||
}
|
||||
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
||||
value,
|
||||
range: self.range,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_bytes(&mut self) -> Result<StringType, LexicalError> {
|
||||
let mut content = String::with_capacity(self.rest.len());
|
||||
while let Some(ch) = self.next_char() {
|
||||
match ch {
|
||||
'\\' if !self.kind.is_raw() => {
|
||||
self.parse_escaped_char(&mut content)?;
|
||||
b'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
|
||||
match self.parse_escaped_char()? {
|
||||
None => {}
|
||||
Some(EscapedChar::Literal(c)) => value.push(c),
|
||||
Some(EscapedChar::Escape(c)) => {
|
||||
value.push('\\');
|
||||
value.push(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
ch => {
|
||||
if !ch.is_ascii() {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::OtherError(
|
||||
"bytes can only contain ASCII literal characters"
|
||||
.to_string()
|
||||
.into_boxed_str(),
|
||||
),
|
||||
self.get_pos(),
|
||||
));
|
||||
}
|
||||
content.push(ch);
|
||||
value.push(char::from(*ch));
|
||||
}
|
||||
}
|
||||
|
||||
let Some(next_index) =
|
||||
memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes())
|
||||
else {
|
||||
// Add the rest of the string to the value.
|
||||
let rest = &self.source[self.cursor..];
|
||||
value.push_str(rest);
|
||||
break;
|
||||
};
|
||||
|
||||
index = next_index;
|
||||
}
|
||||
Ok(StringType::Bytes(ast::BytesLiteral {
|
||||
value: content.chars().map(|c| c as u8).collect::<Vec<u8>>(),
|
||||
|
||||
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
||||
value: value.into_boxed_str(),
|
||||
range: self.range,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_string(&mut self) -> Result<StringType, LexicalError> {
|
||||
let mut value = String::with_capacity(self.rest.len());
|
||||
if self.kind.is_raw() {
|
||||
value.push_str(self.skip_bytes(self.rest.len()));
|
||||
} else {
|
||||
loop {
|
||||
let Some(escape_idx) = self.rest.find('\\') else {
|
||||
value.push_str(self.skip_bytes(self.rest.len()));
|
||||
break;
|
||||
};
|
||||
|
||||
let before_with_slash = self.skip_bytes(escape_idx + 1);
|
||||
let before = &before_with_slash[..before_with_slash.len() - 1];
|
||||
|
||||
value.push_str(before);
|
||||
self.parse_escaped_char(&mut value)?;
|
||||
}
|
||||
fn parse_bytes(mut self) -> Result<StringType, LexicalError> {
|
||||
if let Some(index) = self.source.as_bytes().find_non_ascii_byte() {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::OtherError(
|
||||
"bytes can only contain ASCII literal characters"
|
||||
.to_string()
|
||||
.into_boxed_str(),
|
||||
),
|
||||
self.offset + TextSize::try_from(index).unwrap(),
|
||||
));
|
||||
}
|
||||
|
||||
if self.kind.is_raw() {
|
||||
// For raw strings, no escaping is necessary.
|
||||
return Ok(StringType::Bytes(ast::BytesLiteral {
|
||||
value: self.source.into_boxed_bytes(),
|
||||
range: self.range,
|
||||
}));
|
||||
}
|
||||
|
||||
let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
|
||||
// If the string doesn't contain any escape sequences, return the owned string.
|
||||
return Ok(StringType::Bytes(ast::BytesLiteral {
|
||||
value: self.source.into_boxed_bytes(),
|
||||
range: self.range,
|
||||
}));
|
||||
};
|
||||
|
||||
// If the string contains escape sequences, we need to parse them.
|
||||
let mut value = Vec::with_capacity(self.source.len());
|
||||
loop {
|
||||
// Add the characters before the escape sequence to the string.
|
||||
let before_with_slash = self.skip_bytes(escape + 1);
|
||||
let before = &before_with_slash[..before_with_slash.len() - 1];
|
||||
value.extend_from_slice(before.as_bytes());
|
||||
|
||||
// Add the escaped character to the string.
|
||||
match self.parse_escaped_char()? {
|
||||
None => {}
|
||||
Some(EscapedChar::Literal(c)) => value.push(c as u8),
|
||||
Some(EscapedChar::Escape(c)) => {
|
||||
value.push(b'\\');
|
||||
value.push(c as u8);
|
||||
}
|
||||
}
|
||||
|
||||
let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes())
|
||||
else {
|
||||
// Add the rest of the string to the value.
|
||||
let rest = &self.source[self.cursor..];
|
||||
value.extend_from_slice(rest.as_bytes());
|
||||
break;
|
||||
};
|
||||
|
||||
// Update the position of the next escape sequence.
|
||||
escape = next_escape;
|
||||
}
|
||||
|
||||
Ok(StringType::Bytes(ast::BytesLiteral {
|
||||
value: value.into_boxed_slice(),
|
||||
range: self.range,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_string(mut self) -> Result<StringType, LexicalError> {
|
||||
if self.kind.is_raw() {
|
||||
// For raw strings, no escaping is necessary.
|
||||
return Ok(StringType::Str(ast::StringLiteral {
|
||||
value: self.source,
|
||||
unicode: self.kind.is_unicode(),
|
||||
range: self.range,
|
||||
}));
|
||||
}
|
||||
|
||||
let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
|
||||
// If the string doesn't contain any escape sequences, return the owned string.
|
||||
return Ok(StringType::Str(ast::StringLiteral {
|
||||
value: self.source,
|
||||
unicode: self.kind.is_unicode(),
|
||||
range: self.range,
|
||||
}));
|
||||
};
|
||||
|
||||
// If the string contains escape sequences, we need to parse them.
|
||||
let mut value = String::with_capacity(self.source.len());
|
||||
|
||||
loop {
|
||||
// Add the characters before the escape sequence to the string.
|
||||
let before_with_slash = self.skip_bytes(escape + 1);
|
||||
let before = &before_with_slash[..before_with_slash.len() - 1];
|
||||
value.push_str(before);
|
||||
|
||||
// Add the escaped character to the string.
|
||||
match self.parse_escaped_char()? {
|
||||
None => {}
|
||||
Some(EscapedChar::Literal(c)) => value.push(c),
|
||||
Some(EscapedChar::Escape(c)) => {
|
||||
value.push('\\');
|
||||
value.push(c);
|
||||
}
|
||||
}
|
||||
|
||||
let Some(next_escape) = self.source[self.cursor..].find('\\') else {
|
||||
// Add the rest of the string to the value.
|
||||
let rest = &self.source[self.cursor..];
|
||||
value.push_str(rest);
|
||||
break;
|
||||
};
|
||||
|
||||
// Update the position of the next escape sequence.
|
||||
escape = next_escape;
|
||||
}
|
||||
|
||||
Ok(StringType::Str(ast::StringLiteral {
|
||||
value: value.into_boxed_str(),
|
||||
unicode: self.kind.is_unicode(),
|
||||
@@ -301,7 +410,7 @@ impl<'a> StringParser<'a> {
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse(&mut self) -> Result<StringType, LexicalError> {
|
||||
fn parse(self) -> Result<StringType, LexicalError> {
|
||||
if self.kind.is_any_bytes() {
|
||||
self.parse_bytes()
|
||||
} else {
|
||||
@@ -310,8 +419,8 @@ impl<'a> StringParser<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn parse_string_literal(
|
||||
source: &str,
|
||||
pub fn parse_string_literal(
|
||||
source: Box<str>,
|
||||
kind: StringKind,
|
||||
triple_quoted: bool,
|
||||
range: TextRange,
|
||||
@@ -326,8 +435,8 @@ pub(crate) fn parse_string_literal(
|
||||
StringParser::new(source, kind, start_location, range).parse()
|
||||
}
|
||||
|
||||
pub(crate) fn parse_fstring_literal_element(
|
||||
source: &str,
|
||||
pub fn parse_fstring_literal_element(
|
||||
source: Box<str>,
|
||||
is_raw: bool,
|
||||
range: TextRange,
|
||||
) -> Result<ast::FStringElement, LexicalError> {
|
||||
@@ -360,7 +469,7 @@ pub(crate) fn concatenated_strings(
|
||||
if has_bytes && byte_literal_count < strings.len() {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::OtherError(
|
||||
"cannot mix bytes and nonbytes literals"
|
||||
"cannot mix bytes and non-bytes literals"
|
||||
.to_string()
|
||||
.into_boxed_str(),
|
||||
),
|
||||
@@ -415,7 +524,7 @@ pub(crate) fn concatenated_strings(
|
||||
// TODO: consolidate these with ParseError
|
||||
/// An error that occurred during parsing of an f-string.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
struct FStringError {
|
||||
pub(crate) struct FStringError {
|
||||
/// The type of error that occurred.
|
||||
pub(crate) error: FStringErrorType,
|
||||
/// The location of the error.
|
||||
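The rewritten `string.rs` above replaces the old character-by-character loop with `memchr`/`memchr3` scans: it jumps directly to the next `\`, `{`, or `}` and copies everything in between with a single `push_str`. A small self-contained illustration of that scanning pattern, assuming only the `memchr` crate (the escape handling is deliberately a stub):

```rust
use memchr::memchr3;

/// Copy `source` into `out`, jumping between the interesting bytes instead of
/// walking char by char. Escape handling is a stub for illustration only.
fn scan(source: &str, out: &mut String) {
    let mut rest = source;
    while let Some(index) = memchr3(b'\\', b'{', b'}', rest.as_bytes()) {
        // Everything before the interesting byte is copied in one shot.
        out.push_str(&rest[..index]);
        // In the real parser this is where escapes and doubled braces are handled.
        out.push(rest.as_bytes()[index] as char);
        rest = &rest[index + 1..];
    }
    // No more interesting bytes: copy the tail.
    out.push_str(rest);
}

fn main() {
    let mut out = String::new();
    scan(r"Item \t {i}", &mut out);
    assert_eq!(out, r"Item \t {i}");
}
```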