From 2922490cb8ad4fb0efee617dfe2251319bb1a4e4 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <andrew@astral.sh>
Date: Wed, 8 Jan 2025 15:07:33 -0500
Subject: [PATCH] ruff_linter: fix handling of unprintable characters

Previously, we were replacing unprintable ASCII characters with a
printable representation of them via fancier Unicode characters. Since
`annotate-snippets` used to use codepoint offsets, this didn't make our
ranges incorrect: we swapped one codepoint for another.

But now, with the `annotate-snippets` upgrade, we use byte offsets
(which is IMO the correct choice). However, this means our ranges can be
thrown off since an ASCII codepoint is always one byte and a non-ASCII
codepoint is always more than one byte.

Instead of tweaking the `ShowNonprinting` trait and making it more
complicated (which is used in places other than this diagnostic
rendering it seems), we instead change `replace_whitespace` to handle
non-printable characters. This works out because `replace_whitespace`
was already updating the annotation range to account for the tab
replacement. We copy that approach for unprintable characters.
---
 crates/ruff_linter/src/message/text.rs | 67 ++++++++++++++++++--------
 1 file changed, 47 insertions(+), 20 deletions(-)

diff --git a/crates/ruff_linter/src/message/text.rs b/crates/ruff_linter/src/message/text.rs
index 98c7dddc36..ec2b97e23a 100644
--- a/crates/ruff_linter/src/message/text.rs
+++ b/crates/ruff_linter/src/message/text.rs
@@ -8,14 +8,13 @@ use ruff_annotate_snippets::{Level, Renderer, Snippet};
 
 use ruff_notebook::NotebookIndex;
 use ruff_source_file::{OneIndexed, SourceLocation};
-use ruff_text_size::{Ranged, TextRange, TextSize};
+use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
 
 use crate::fs::relativize_path;
 use crate::line_width::{IndentWidth, LineWidthBuilder};
 use crate::message::diff::Diff;
 use crate::message::{Emitter, EmitterContext, Message};
 use crate::settings::types::UnsafeFixes;
-use crate::text_helpers::ShowNonprinting;
 
 bitflags! {
     #[derive(Default)]
@@ -245,13 +244,11 @@ impl Display for MessageCodeFrame<'_> {
         let start_offset = source_code.line_start(start_index);
         let end_offset = source_code.line_end(end_index);
 
-        let source = replace_whitespace(
+        let source = replace_whitespace_and_unprintable(
             source_code.slice(TextRange::new(start_offset, end_offset)),
             self.message.range() - start_offset,
         );
 
-        let source_text = source.text.show_nonprinting();
-
         let label = self
             .message
             .rule()
@@ -270,7 +267,7 @@ impl Display for MessageCodeFrame<'_> {
         let span = usize::from(source.annotation_range.start())
             ..usize::from(source.annotation_range.end());
         let annotation = Level::Error.span(span).label(&label);
-        let snippet = Snippet::source(&source_text)
+        let snippet = Snippet::source(&source.text)
             .line_start(line_start)
             .annotation(annotation)
             .fold(false);
@@ -286,38 +283,68 @@ impl Display for MessageCodeFrame<'_> {
     }
 }
 
-fn replace_whitespace(source: &str, annotation_range: TextRange) -> SourceCode {
+/// Given some source code and an annotation range, this routine replaces
+/// tabs with ASCII whitespace, and unprintable characters with printable
+/// representations of them.
+///
+/// The source code returned has an annotation that is updated to reflect
+/// changes made to the source code (if any).
+fn replace_whitespace_and_unprintable(source: &str, annotation_range: TextRange) -> SourceCode {
     let mut result = String::new();
     let mut last_end = 0;
     let mut range = annotation_range;
     let mut line_width = LineWidthBuilder::new(IndentWidth::default());
 
+    // Updates the range given by the caller whenever a single byte (at
+    // `index` in `source`) is replaced with `len` bytes.
+    //
+    // When the index occurs before the start of the range, the range is
+    // offset by `len`. When the range occurs after or at the start but before
+    // the end, then the end of the range only is offset by `len`.
+    let mut update_range = |index, len| {
+        if index < usize::from(annotation_range.start()) {
+            range += TextSize::new(len - 1);
+        } else if index < usize::from(annotation_range.end()) {
+            range = range.add_end(TextSize::new(len - 1));
+        }
+    };
+
+    // If `c` is an unprintable character, then this returns a printable
+    // representation of it (using a fancier Unicode codepoint).
+    let unprintable_replacement = |c: char| -> Option<char> {
+        match c {
+            '\x07' => Some('␇'),
+            '\x08' => Some('␈'),
+            '\x1b' => Some('␛'),
+            '\x7f' => Some('␡'),
+            _ => None,
+        }
+    };
+
     for (index, c) in source.char_indices() {
         let old_width = line_width.get();
         line_width = line_width.add_char(c);
 
         if matches!(c, '\t') {
-            // SAFETY: The difference is a value in the range [1..TAB_SIZE] which is guaranteed to be less than `u32`.
-            #[allow(clippy::cast_possible_truncation)]
-            let tab_width = (line_width.get() - old_width) as u32;
-
-            if index < usize::from(annotation_range.start()) {
-                range += TextSize::new(tab_width - 1);
-            } else if index < usize::from(annotation_range.end()) {
-                range = range.add_end(TextSize::new(tab_width - 1));
-            }
-
+            let tab_width = u32::try_from(line_width.get() - old_width)
+                .expect("small width because of tab size");
             result.push_str(&source[last_end..index]);
-
             for _ in 0..tab_width {
                 result.push(' ');
             }
-
             last_end = index + 1;
+            update_range(index, tab_width);
+        } else if let Some(printable) = unprintable_replacement(c) {
+            result.push_str(&source[last_end..index]);
+            result.push(printable);
+            last_end = index + 1;
+
+            let len = printable.text_len().to_u32();
+            update_range(index, len);
         }
     }
 
-    // No tabs
+    // No tabs or unprintable chars
     if result.is_empty() {
         SourceCode {
             annotation_range,