From 2922490cb8ad4fb0efee617dfe2251319bb1a4e4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 8 Jan 2025 15:07:33 -0500 Subject: [PATCH] ruff_linter: fix handling of unprintable characters Previously, we were replacing unprintable ASCII characters with a printable representation of them via fancier Unicode characters. Since `annotate-snippets` used to use codepoint offsets, this didn't make our ranges incorrect: we swapped one codepoint for another. But now, with the `annotate-snippets` upgrade, we use byte offsets (which is IMO the correct choice). However, this means our ranges can be thrown off since an ASCII codepoint is always one byte and a non-ASCII codepoint is always more than one byte. Instead of tweaking the `ShowNonprinting` trait and making it more complicated (which is used in places other than this diagnostic rendering it seems), we instead change `replace_whitespace` to handle non-printable characters. This works out because `replace_whitespace` was already updating the annotation range to account for the tab replacement. We copy that approach for unprintable characters. --- crates/ruff_linter/src/message/text.rs | 67 ++++++++++++++++++-------- 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/crates/ruff_linter/src/message/text.rs b/crates/ruff_linter/src/message/text.rs index 98c7dddc36..ec2b97e23a 100644 --- a/crates/ruff_linter/src/message/text.rs +++ b/crates/ruff_linter/src/message/text.rs @@ -8,14 +8,13 @@ use ruff_annotate_snippets::{Level, Renderer, Snippet}; use ruff_notebook::NotebookIndex; use ruff_source_file::{OneIndexed, SourceLocation}; -use ruff_text_size::{Ranged, TextRange, TextSize}; +use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; use crate::fs::relativize_path; use crate::line_width::{IndentWidth, LineWidthBuilder}; use crate::message::diff::Diff; use crate::message::{Emitter, EmitterContext, Message}; use crate::settings::types::UnsafeFixes; -use crate::text_helpers::ShowNonprinting; bitflags! { #[derive(Default)] @@ -245,13 +244,11 @@ impl Display for MessageCodeFrame<'_> { let start_offset = source_code.line_start(start_index); let end_offset = source_code.line_end(end_index); - let source = replace_whitespace( + let source = replace_whitespace_and_unprintable( source_code.slice(TextRange::new(start_offset, end_offset)), self.message.range() - start_offset, ); - let source_text = source.text.show_nonprinting(); - let label = self .message .rule() @@ -270,7 +267,7 @@ impl Display for MessageCodeFrame<'_> { let span = usize::from(source.annotation_range.start()) ..usize::from(source.annotation_range.end()); let annotation = Level::Error.span(span).label(&label); - let snippet = Snippet::source(&source_text) + let snippet = Snippet::source(&source.text) .line_start(line_start) .annotation(annotation) .fold(false); @@ -286,38 +283,68 @@ impl Display for MessageCodeFrame<'_> { } } -fn replace_whitespace(source: &str, annotation_range: TextRange) -> SourceCode { +/// Given some source code and an annotation range, this routine replaces +/// tabs with ASCII whitespace, and unprintable characters with printable +/// representations of them. +/// +/// The source code returned has an annotation that is updated to reflect +/// changes made to the source code (if any). +fn replace_whitespace_and_unprintable(source: &str, annotation_range: TextRange) -> SourceCode { let mut result = String::new(); let mut last_end = 0; let mut range = annotation_range; let mut line_width = LineWidthBuilder::new(IndentWidth::default()); + // Updates the range given by the caller whenever a single byte (at + // `index` in `source`) is replaced with `len` bytes. + // + // When the index occurs before the start of the range, the range is + // offset by `len`. When the range occurs after or at the start but before + // the end, then the end of the range only is offset by `len`. + let mut update_range = |index, len| { + if index < usize::from(annotation_range.start()) { + range += TextSize::new(len - 1); + } else if index < usize::from(annotation_range.end()) { + range = range.add_end(TextSize::new(len - 1)); + } + }; + + // If `c` is an unprintable character, then this returns a printable + // representation of it (using a fancier Unicode codepoint). + let unprintable_replacement = |c: char| -> Option { + match c { + '\x07' => Some('␇'), + '\x08' => Some('␈'), + '\x1b' => Some('␛'), + '\x7f' => Some('␡'), + _ => None, + } + }; + for (index, c) in source.char_indices() { let old_width = line_width.get(); line_width = line_width.add_char(c); if matches!(c, '\t') { - // SAFETY: The difference is a value in the range [1..TAB_SIZE] which is guaranteed to be less than `u32`. - #[allow(clippy::cast_possible_truncation)] - let tab_width = (line_width.get() - old_width) as u32; - - if index < usize::from(annotation_range.start()) { - range += TextSize::new(tab_width - 1); - } else if index < usize::from(annotation_range.end()) { - range = range.add_end(TextSize::new(tab_width - 1)); - } - + let tab_width = u32::try_from(line_width.get() - old_width) + .expect("small width because of tab size"); result.push_str(&source[last_end..index]); - for _ in 0..tab_width { result.push(' '); } - last_end = index + 1; + update_range(index, tab_width); + } else if let Some(printable) = unprintable_replacement(c) { + result.push_str(&source[last_end..index]); + result.push(printable); + last_end = index + 1; + + let len = printable.text_len().to_u32(); + update_range(index, len); } } - // No tabs + // No tabs or unprintable chars if result.is_empty() { SourceCode { annotation_range,