From 025768d30344b275e9caa2f72a17db8fa4a63ccd Mon Sep 17 00:00:00 2001
From: Dhruv Manilawala
Date: Tue, 14 May 2024 22:15:04 +0530
Subject: [PATCH] Add `Tokens` newtype wrapper, `TokenKind` iterator (#11361)

## Summary

Alternative to #11237

This PR adds a new `Tokens` struct which is a newtype wrapper around a vector of lexer output. This allows us to add a `kinds` method which returns an iterator over the corresponding `TokenKind`. The iterator is implemented as a separate `TokenKindIter` struct so that the type can be named and additional methods like `peek` can be provided directly on the iterator.

This lets the linter access the stream of `TokenKind` instead of `Tok`.

Edit: I've made the necessary downstream changes and plan to merge the entire stack at once.

---
 crates/ruff_linter/src/importer/insertion.rs |   5 +-
 crates/ruff_linter/src/linter.rs             |  21 +++-
 crates/ruff_linter/src/rules/pyflakes/mod.rs |   3 +-
 crates/ruff_linter/src/test.rs               |   6 +-
 crates/ruff_python_parser/src/lib.rs         | 118 +++++++++++++++++-
 crates/ruff_python_parser/src/token.rs       |   5 +
 .../tests/block_comments.rs                  |   3 +-
 crates/ruff_server/src/lint.rs               |   3 +-
 crates/ruff_wasm/src/lib.rs                  |   3 +-
 9 files changed, 142 insertions(+), 25 deletions(-)

diff --git a/crates/ruff_linter/src/importer/insertion.rs b/crates/ruff_linter/src/importer/insertion.rs
index 5cd6ae200c..274147a756 100644
--- a/crates/ruff_linter/src/importer/insertion.rs
+++ b/crates/ruff_linter/src/importer/insertion.rs
@@ -321,7 +321,6 @@ mod tests {
     use ruff_python_ast::PySourceType;
     use ruff_python_codegen::Stylist;
-    use ruff_python_parser::lexer::LexResult;
     use ruff_python_parser::{parse_suite, Mode};
     use ruff_source_file::{LineEnding, Locator};
     use ruff_text_size::TextSize;
@@ -332,7 +331,7 @@ mod tests {
     fn start_of_file() -> Result<()> {
         fn insert(contents: &str) -> Result<Insertion> {
             let program = parse_suite(contents)?;
-            let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, Mode::Module);
+            let tokens = ruff_python_parser::tokenize(contents, Mode::Module);
             let locator = Locator::new(contents);
             let stylist = Stylist::from_tokens(&tokens, &locator);
             Ok(Insertion::start_of_file(&program, &locator, &stylist))
         }
@@ -443,7 +442,7 @@ x = 1
     #[test]
     fn start_of_block() {
        fn insert(contents: &str, offset: TextSize) -> Insertion {
-            let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, Mode::Module);
+            let tokens = ruff_python_parser::tokenize(contents, Mode::Module);
            let locator = Locator::new(contents);
             let stylist = Stylist::from_tokens(&tokens, &locator);
             Insertion::start_of_block(offset, &locator, &stylist, PySourceType::default())
diff --git a/crates/ruff_linter/src/linter.rs b/crates/ruff_linter/src/linter.rs
index cb3ac3318b..7033dbab97 100644
--- a/crates/ruff_linter/src/linter.rs
+++ b/crates/ruff_linter/src/linter.rs
@@ -14,7 +14,7 @@ use ruff_python_ast::{PySourceType, Suite};
 use ruff_python_codegen::Stylist;
 use ruff_python_index::Indexer;
 use ruff_python_parser::lexer::LexResult;
-use ruff_python_parser::{AsMode, ParseError};
+use ruff_python_parser::{AsMode, ParseError, TokenKindIter, Tokens};
 use ruff_source_file::{Locator, SourceFileBuilder};
 use ruff_text_size::Ranged;
@@ -353,7 +353,7 @@ pub fn add_noqa_to_path(
     let contents = source_kind.source_code();

     // Tokenize once.
-    let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, source_type.as_mode());
+    let tokens = ruff_python_parser::tokenize(contents, source_type.as_mode());

     // Map row and column locations to byte slices (lazily).
     let locator = Locator::new(contents);
@@ -518,8 +518,7 @@ pub fn lint_fix<'a>(
     // Continuously fix until the source code stabilizes.
     loop {
         // Tokenize once.
-        let tokens: Vec<LexResult> =
-            ruff_python_parser::tokenize(transformed.source_code(), source_type.as_mode());
+        let tokens = ruff_python_parser::tokenize(transformed.source_code(), source_type.as_mode());

         // Map row and column locations to byte slices (lazily).
         let locator = Locator::new(transformed.source_code());
@@ -715,7 +714,7 @@ impl<'a> ParseSource<'a> {
 #[derive(Debug, Clone)]
 pub enum TokenSource<'a> {
     /// Use the precomputed tokens to generate the AST.
-    Tokens(Vec<LexResult>),
+    Tokens(Tokens),
     /// Use the precomputed tokens and AST.
     Precomputed {
         tokens: &'a [LexResult],
@@ -723,6 +722,18 @@ pub enum TokenSource<'a> {
     },
 }

+impl TokenSource<'_> {
+    /// Returns an iterator over the [`TokenKind`] and the corresponding range.
+    ///
+    /// [`TokenKind`]: ruff_python_parser::TokenKind
+    pub fn kinds(&self) -> TokenKindIter {
+        match self {
+            TokenSource::Tokens(tokens) => tokens.kinds(),
+            TokenSource::Precomputed { tokens, .. } => TokenKindIter::new(tokens),
+        }
+    }
+}
+
 impl Deref for TokenSource<'_> {
     type Target = [LexResult];

diff --git a/crates/ruff_linter/src/rules/pyflakes/mod.rs b/crates/ruff_linter/src/rules/pyflakes/mod.rs
index 8ef30efbdc..b4f3618098 100644
--- a/crates/ruff_linter/src/rules/pyflakes/mod.rs
+++ b/crates/ruff_linter/src/rules/pyflakes/mod.rs
@@ -11,7 +11,6 @@ mod tests {
     use anyhow::Result;
     use regex::Regex;
-    use ruff_python_parser::lexer::LexResult;

     use test_case::test_case;
@@ -591,7 +590,7 @@ mod tests {
         let source_type = PySourceType::default();
         let source_kind = SourceKind::Python(contents.to_string());
         let settings = LinterSettings::for_rules(Linter::Pyflakes.rules());
-        let tokens: Vec<LexResult> = ruff_python_parser::tokenize(&contents, source_type.as_mode());
+        let tokens = ruff_python_parser::tokenize(&contents, source_type.as_mode());
         let locator = Locator::new(&contents);
         let stylist = Stylist::from_tokens(&tokens, &locator);
         let indexer = Indexer::from_tokens(&tokens, &locator);
diff --git a/crates/ruff_linter/src/test.rs b/crates/ruff_linter/src/test.rs
index d83c5f5403..63a69f3857 100644
--- a/crates/ruff_linter/src/test.rs
+++ b/crates/ruff_linter/src/test.rs
@@ -16,7 +16,6 @@ use ruff_notebook::NotebookError;
 use ruff_python_ast::PySourceType;
 use ruff_python_codegen::Stylist;
 use ruff_python_index::Indexer;
-use ruff_python_parser::lexer::LexResult;
 use ruff_python_parser::AsMode;
 use ruff_python_trivia::textwrap::dedent;
 use ruff_source_file::{Locator, SourceFileBuilder};
@@ -111,8 +110,7 @@ pub(crate) fn test_contents<'a>(
     settings: &LinterSettings,
 ) -> (Vec<Message>, Cow<'a, SourceKind>) {
     let source_type = PySourceType::from(path);
-    let tokens: Vec<LexResult> =
-        ruff_python_parser::tokenize(source_kind.source_code(), source_type.as_mode());
+    let tokens = ruff_python_parser::tokenize(source_kind.source_code(), source_type.as_mode());
     let locator = Locator::new(source_kind.source_code());
     let stylist = Stylist::from_tokens(&tokens, &locator);
     let indexer = Indexer::from_tokens(&tokens, &locator);
@@ -177,7 +175,7 @@
                 transformed = Cow::Owned(transformed.updated(fixed_contents, &source_map));

-                let tokens: Vec<LexResult> =
+                let tokens =
                     ruff_python_parser::tokenize(transformed.source_code(), source_type.as_mode());
                 let locator = Locator::new(transformed.source_code());
                 let stylist = Stylist::from_tokens(&tokens, &locator);
diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs
index ee7a7399fd..012a72d460 100644
--- a/crates/ruff_python_parser/src/lib.rs
+++ b/crates/ruff_python_parser/src/lib.rs
@@ -110,6 +110,9 @@
 //! [parsing]: https://en.wikipedia.org/wiki/Parsing
 //! [lexer]: crate::lexer

+use std::iter::FusedIterator;
+use std::ops::Deref;
+
 use crate::lexer::{lex, lex_starts_at, LexResult};

 pub use crate::error::{FStringErrorType, ParseError, ParseErrorType};
@@ -117,7 +120,7 @@ pub use crate::parser::Program;
 pub use crate::token::{Tok, TokenKind};

 use ruff_python_ast::{Expr, Mod, ModModule, PySourceType, Suite};
-use ruff_text_size::TextSize;
+use ruff_text_size::{Ranged, TextRange, TextSize};

 mod error;
 pub mod lexer;
@@ -339,8 +342,113 @@ pub fn parse_tokens(tokens: Vec<LexResult>, source: &str, mode: Mode) -> Result<Mod, ParseError> {
     }
 }

+/// Tokens represents a vector of [`LexResult`].
+///
+/// This should only include tokens up to and including the first error. This struct is created
+/// by the [`tokenize`] function.
+#[derive(Debug, Clone)]
+pub struct Tokens(Vec<LexResult>);
+
+impl Tokens {
+    /// Returns an iterator over the [`TokenKind`] and the corresponding range for each token.
+    pub fn kinds(&self) -> TokenKindIter {
+        TokenKindIter::new(&self.0)
+    }
+
+    /// Returns an iterator over the [`TokenKind`] and its range for all the tokens that are
+    /// within the given `range`.
+    ///
+    /// The start and end position of the given range should correspond to the start position of
+    /// the first token and the end position of the last token in the returned iterator.
+    ///
+    /// For example, if the struct contains the following tokens:
+    /// ```txt
+    /// (Def, 0..3)
+    /// (Name, 4..7)
+    /// (Lpar, 7..8)
+    /// (Rpar, 8..9)
+    /// (Colon, 9..10)
+    /// (Ellipsis, 11..14)
+    /// (Newline, 14..14)
+    /// ```
+    ///
+    /// Then the range `4..10` returns an iterator which yields the `Name`, `Lpar`, `Rpar`, and
+    /// `Colon` tokens. But if the given positions don't match the start and end of any tokens,
+    /// an empty iterator is returned.
+    pub fn kinds_within_range<T: Ranged>(&self, ranged: T) -> TokenKindIter {
+        let Ok(start_index) = self.binary_search_by_key(&ranged.start(), |result| match result {
+            Ok((_, range)) => range.start(),
+            Err(error) => error.location().start(),
+        }) else {
+            return TokenKindIter::default();
+        };
+
+        let Ok(end_index) = self.binary_search_by_key(&ranged.end(), |result| match result {
+            Ok((_, range)) => range.end(),
+            Err(error) => error.location().end(),
+        }) else {
+            return TokenKindIter::default();
+        };
+
+        TokenKindIter::new(self.get(start_index..=end_index).unwrap_or(&[]))
+    }
+
+    /// Consumes the [`Tokens`], returning the underlying vector of [`LexResult`].
+    pub fn into_inner(self) -> Vec<LexResult> {
+        self.0
+    }
+}
+
+impl Deref for Tokens {
+    type Target = [LexResult];
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+/// An iterator over the [`TokenKind`] and the corresponding range.
+///
+/// This struct is created by the [`Tokens::kinds`] method.
+#[derive(Clone, Default)]
+pub struct TokenKindIter<'a> {
+    inner: std::iter::Flatten<std::slice::Iter<'a, LexResult>>,
+}
+
+impl<'a> TokenKindIter<'a> {
+    /// Create a new iterator from a slice of [`LexResult`].
+    pub fn new(tokens: &'a [LexResult]) -> Self {
+        Self {
+            inner: tokens.iter().flatten(),
+        }
+    }
+
+    /// Return the next value without advancing the iterator.
+    pub fn peek(&mut self) -> Option<(TokenKind, TextRange)> {
+        self.clone().next()
+    }
+}
+
+impl Iterator for TokenKindIter<'_> {
+    type Item = (TokenKind, TextRange);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let &(ref tok, range) = self.inner.next()?;
+        Some((TokenKind::from_token(tok), range))
+    }
+}
+
+impl FusedIterator for TokenKindIter<'_> {}
+
+impl DoubleEndedIterator for TokenKindIter<'_> {
+    fn next_back(&mut self) -> Option<Self::Item> {
+        let &(ref tok, range) = self.inner.next_back()?;
+        Some((TokenKind::from_token(tok), range))
+    }
+}
+
 /// Collect tokens up to and including the first error.
-pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
+pub fn tokenize(contents: &str, mode: Mode) -> Tokens {
     let mut tokens: Vec<LexResult> = allocate_tokens_vec(contents);
     for tok in lexer::lex(contents, mode) {
         let is_err = tok.is_err();
@@ -350,7 +458,7 @@ pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
         }
     }

-    tokens
+    Tokens(tokens)
 }

 /// Tokenizes all tokens.
@@ -380,7 +488,7 @@ fn approximate_tokens_lower_bound(contents: &str) -> usize {

 /// Parse a full Python program from its tokens.
 pub fn parse_program_tokens(
-    tokens: Vec<LexResult>,
+    tokens: Tokens,
     source: &str,
     is_jupyter_notebook: bool,
 ) -> anyhow::Result<Suite> {
@@ -389,7 +497,7 @@ pub fn parse_program_tokens(
     } else {
         Mode::Module
     };
-    match parse_tokens(tokens, source, mode)? {
+    match parse_tokens(tokens.into_inner(), source, mode)? {
         Mod::Module(m) => Ok(m.body),
         Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
     }
diff --git a/crates/ruff_python_parser/src/token.rs b/crates/ruff_python_parser/src/token.rs
index 36359dc2f0..146ac071ef 100644
--- a/crates/ruff_python_parser/src/token.rs
+++ b/crates/ruff_python_parser/src/token.rs
@@ -228,6 +228,11 @@ pub enum Tok {
 }

 impl Tok {
+    #[inline]
+    pub fn kind(&self) -> TokenKind {
+        TokenKind::from_token(self)
+    }
+
     pub fn start_marker(mode: Mode) -> Self {
         match mode {
             Mode::Module | Mode::Ipython => Tok::StartModule,
diff --git a/crates/ruff_python_trivia_integration_tests/tests/block_comments.rs b/crates/ruff_python_trivia_integration_tests/tests/block_comments.rs
index df0142b3c1..fe6cc47ac9 100644
--- a/crates/ruff_python_trivia_integration_tests/tests/block_comments.rs
+++ b/crates/ruff_python_trivia_integration_tests/tests/block_comments.rs
@@ -1,5 +1,4 @@
 use ruff_python_index::Indexer;
-use ruff_python_parser::lexer::LexResult;
 use ruff_python_parser::{tokenize, Mode};
 use ruff_source_file::Locator;
 use ruff_text_size::TextSize;
@@ -38,7 +37,7 @@ fn block_comments_indented_block()
 fn block_comments_single_line_is_not_a_block() {
     // arrange
     let source = "\n";
-    let tokens: Vec<LexResult> = tokenize(source, Mode::Module);
+    let tokens = tokenize(source, Mode::Module);
     let locator = Locator::new(source);
     let indexer = Indexer::from_tokens(&tokens, &locator);
diff --git a/crates/ruff_server/src/lint.rs b/crates/ruff_server/src/lint.rs
index 887fe27226..159c87d8cb 100644
--- a/crates/ruff_server/src/lint.rs
+++ b/crates/ruff_server/src/lint.rs
@@ -13,7 +13,6 @@ use ruff_linter::{
 use ruff_python_ast::PySourceType;
 use ruff_python_codegen::Stylist;
 use ruff_python_index::Indexer;
-use ruff_python_parser::lexer::LexResult;
 use ruff_python_parser::AsMode;
 use ruff_source_file::Locator;
 use ruff_text_size::Ranged;
@@ -76,7 +75,7 @@ pub(crate) fn check(
     let source_kind = SourceKind::Python(contents.to_string());

     // Tokenize once.
-    let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, source_type.as_mode());
+    let tokens = ruff_python_parser::tokenize(contents, source_type.as_mode());

     // Map row and column locations to byte slices (lazily).
     let locator = Locator::with_index(contents, index);
diff --git a/crates/ruff_wasm/src/lib.rs b/crates/ruff_wasm/src/lib.rs
index c9dd3603e9..56843a82e0 100644
--- a/crates/ruff_wasm/src/lib.rs
+++ b/crates/ruff_wasm/src/lib.rs
@@ -17,7 +17,6 @@ use ruff_python_ast::{Mod, PySourceType};
 use ruff_python_codegen::Stylist;
 use ruff_python_formatter::{format_module_ast, pretty_comments, PyFormatContext, QuoteStyle};
 use ruff_python_index::{CommentRangesBuilder, Indexer};
-use ruff_python_parser::lexer::LexResult;
 use ruff_python_parser::{parse_tokens, tokenize_all, AsMode, Mode, Program};
 use ruff_python_trivia::CommentRanges;
 use ruff_source_file::{Locator, SourceLocation};
@@ -162,7 +161,7 @@ impl Workspace {
         let source_kind = SourceKind::Python(contents.to_string());

         // Tokenize once.
-        let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, source_type.as_mode());
+        let tokens = ruff_python_parser::tokenize(contents, source_type.as_mode());

         // Map row and column locations to byte slices (lazily).
         let locator = Locator::new(contents);
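
As a quick usage sketch of the new API for reviewers (the `source` string and the `main` wrapper below are illustrative only; `tokenize`, `Mode`, `TokenKind`, `Tokens::kinds`, and `TokenKindIter::peek` are the items added or changed in this PR):

```rust
use ruff_python_parser::{tokenize, Mode, TokenKind};

fn main() {
    // Lex a small module; `tokenize` now returns the `Tokens` newtype
    // instead of a bare `Vec<LexResult>`.
    let tokens = tokenize("def f(): ...", Mode::Module);

    // `kinds` yields `(TokenKind, TextRange)` pairs, so consumers never
    // need to touch the owned `Tok` values.
    let mut kinds = tokens.kinds();

    // `peek` returns the next value without advancing the iterator.
    if let Some((kind, range)) = kinds.peek() {
        println!("first token: {kind:?} at {range:?}");
    }

    // The iterator then walks all tokens in order.
    for (kind, range) in kinds {
        println!("{kind:?} at {range:?}");
    }
}
```

Because `Tokens` also implements `Deref<Target = [LexResult]>`, existing call sites such as `Stylist::from_tokens(&tokens, &locator)` keep compiling unchanged.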