[ruff] Extend unnecessary-regular-expression to non-literal strings (RUF055) (#14679)

Co-authored-by: Alex Waygood <alex.waygood@gmail.com>
This commit is contained in:
Brent Westbrook
2024-12-03 10:17:20 -05:00
committed by GitHub
parent 81bfcc9899
commit 62e358e929
7 changed files with 387 additions and 165 deletions

View File

@@ -74,3 +74,21 @@ re.sub(
"",
s, # string
)
# A diagnostic should not be emitted for `sub` replacements with backreferences or
# most other ASCII escapes
re.sub(r"a", r"\g<0>\g<0>\g<0>", "a")
re.sub(r"a", r"\1", "a")
re.sub(r"a", r"\s", "a")
# Escapes like \n are "processed":
# `re.sub(r"a", r"\n", some_string)` is fixed to `some_string.replace("a", "\n")`
# *not* `some_string.replace("a", "\\n")`.
# We currently emit diagnostics for some of these without fixing them.
re.sub(r"a", "\n", "a")
re.sub(r"a", r"\n", "a")
re.sub(r"a", "\a", "a")
re.sub(r"a", r"\a", "a")
re.sub(r"a", "\?", "a")
re.sub(r"a", r"\?", "a")

View File

@@ -0,0 +1,17 @@
"""Test that RUF055 can follow a single str assignment for both the pattern and
the replacement argument to re.sub
"""
import re
pat1 = "needle"
re.sub(pat1, "", haystack)
# aliases are not followed, so this one should not trigger the rule
if pat4 := pat1:
re.sub(pat4, "", haystack)
# also works for the `repl` argument in sub
repl = "new"
re.sub(r"abc", repl, haystack)

View File

@@ -411,7 +411,8 @@ mod tests {
#[test_case(Rule::MapIntVersionParsing, Path::new("RUF048_1.py"))]
#[test_case(Rule::UnrawRePattern, Path::new("RUF039.py"))]
#[test_case(Rule::UnrawRePattern, Path::new("RUF039_concat.py"))]
#[test_case(Rule::UnnecessaryRegularExpression, Path::new("RUF055.py"))]
#[test_case(Rule::UnnecessaryRegularExpression, Path::new("RUF055_0.py"))]
#[test_case(Rule::UnnecessaryRegularExpression, Path::new("RUF055_1.py"))]
fn preview_rules(rule_code: Rule, path: &Path) -> Result<()> {
let snapshot = format!(
"preview__{}_{}",

View File

@@ -1,8 +1,11 @@
use ruff_diagnostics::{AlwaysFixableViolation, Applicability, Diagnostic, Edit, Fix};
use itertools::Itertools;
use ruff_diagnostics::{Applicability, Diagnostic, Edit, Fix, FixAvailability, Violation};
use ruff_macros::{derive_message_formats, ViolationMetadata};
use ruff_python_ast::{
Arguments, CmpOp, Expr, ExprAttribute, ExprCall, ExprCompare, ExprContext, Identifier,
Arguments, CmpOp, Expr, ExprAttribute, ExprCall, ExprCompare, ExprContext, ExprStringLiteral,
Identifier,
};
use ruff_python_semantic::analyze::typing::find_binding_value;
use ruff_python_semantic::{Modules, SemanticModel};
use ruff_text_size::TextRange;
@@ -53,17 +56,19 @@ use crate::checkers::ast::Checker;
/// - [Python Regular Expression HOWTO: Common Problems - Use String Methods](https://docs.python.org/3/howto/regex.html#use-string-methods)
#[derive(ViolationMetadata)]
pub(crate) struct UnnecessaryRegularExpression {
replacement: String,
replacement: Option<String>,
}
impl AlwaysFixableViolation for UnnecessaryRegularExpression {
impl Violation for UnnecessaryRegularExpression {
const FIX_AVAILABILITY: FixAvailability = FixAvailability::Sometimes;
#[derive_message_formats]
fn message(&self) -> String {
"Plain string pattern passed to `re` function".to_string()
}
fn fix_title(&self) -> String {
format!("Replace with `{}`", self.replacement)
fn fix_title(&self) -> Option<String> {
Some(format!("Replace with `{}`", self.replacement.as_ref()?))
}
}
@@ -90,8 +95,8 @@ pub(crate) fn unnecessary_regular_expression(checker: &mut Checker, call: &ExprC
return;
};
// For now, restrict this rule to string literals
let Some(string_lit) = re_func.pattern.as_string_literal_expr() else {
// For now, restrict this rule to string literals and variables that can be resolved to literals
let Some(string_lit) = resolve_string_literal(re_func.pattern, semantic) else {
return;
};
@@ -110,33 +115,36 @@ pub(crate) fn unnecessary_regular_expression(checker: &mut Checker, call: &ExprC
// we can proceed with the str method replacement
let new_expr = re_func.replacement();
let repl = checker.generator().expr(&new_expr);
let diagnostic = Diagnostic::new(
let repl = new_expr.map(|expr| checker.generator().expr(&expr));
let mut diagnostic = Diagnostic::new(
UnnecessaryRegularExpression {
replacement: repl.clone(),
},
call.range,
);
let fix = Fix::applicable_edit(
Edit::range_replacement(repl, call.range),
if checker
.comment_ranges()
.has_comments(call, checker.source())
{
Applicability::Unsafe
} else {
Applicability::Safe
},
);
if let Some(repl) = repl {
diagnostic.set_fix(Fix::applicable_edit(
Edit::range_replacement(repl, call.range),
if checker
.comment_ranges()
.has_comments(call, checker.source())
{
Applicability::Unsafe
} else {
Applicability::Safe
},
));
}
checker.diagnostics.push(diagnostic.with_fix(fix));
checker.diagnostics.push(diagnostic);
}
/// The `re` functions supported by this rule.
#[derive(Debug)]
enum ReFuncKind<'a> {
Sub { repl: &'a Expr },
// Only `Some` if it's a fixable `re.sub()` call
Sub { repl: Option<&'a Expr> },
Match,
Search,
Fullmatch,
@@ -152,7 +160,7 @@ struct ReFunc<'a> {
impl<'a> ReFunc<'a> {
fn from_call_expr(
semantic: &SemanticModel,
semantic: &'a SemanticModel,
call: &'a ExprCall,
func_name: &str,
) -> Option<Self> {
@@ -173,11 +181,32 @@ impl<'a> ReFunc<'a> {
// version
("sub", 3) => {
let repl = call.arguments.find_argument("repl", 1)?;
if !repl.is_string_literal_expr() {
return None;
let lit = resolve_string_literal(repl, semantic)?;
let mut fixable = true;
for (c, next) in lit.value.chars().tuple_windows() {
// `\0` (or any other ASCII digit) and `\g` have special meaning in `repl` strings.
// Meanwhile, nearly all other escapes of ASCII letters in a `repl` string cause
// `re.PatternError` to be raised at runtime.
//
// If we see that the escaped character is an alphanumeric ASCII character,
// we should only emit a diagnostic suggesting to replace the `re.sub()` call with
// `str.replace` if we can detect that the escaped character is one that is both
// valid in a `repl` string *and* does not have any special meaning in a `repl` string.
//
// It's out of scope for this rule to change invalid `re.sub()` calls into something
// that would not raise an exception at runtime. They should be left as-is.
if c == '\\' && next.is_ascii_alphanumeric() {
if "abfnrtv".contains(next) {
fixable = false;
} else {
return None;
}
}
}
Some(ReFunc {
kind: ReFuncKind::Sub { repl },
kind: ReFuncKind::Sub {
repl: fixable.then_some(repl),
},
pattern: call.arguments.find_argument("pattern", 0)?,
string: call.arguments.find_argument("string", 2)?,
})
@@ -201,20 +230,20 @@ impl<'a> ReFunc<'a> {
}
}
fn replacement(&self) -> Expr {
fn replacement(&self) -> Option<Expr> {
match self.kind {
// string.replace(pattern, repl)
ReFuncKind::Sub { repl } => {
self.method_expr("replace", vec![self.pattern.clone(), repl.clone()])
}
ReFuncKind::Sub { repl } => repl
.cloned()
.map(|repl| self.method_expr("replace", vec![self.pattern.clone(), repl])),
// string.startswith(pattern)
ReFuncKind::Match => self.method_expr("startswith", vec![self.pattern.clone()]),
ReFuncKind::Match => Some(self.method_expr("startswith", vec![self.pattern.clone()])),
// pattern in string
ReFuncKind::Search => self.compare_expr(CmpOp::In),
ReFuncKind::Search => Some(self.compare_expr(CmpOp::In)),
// string == pattern
ReFuncKind::Fullmatch => self.compare_expr(CmpOp::Eq),
ReFuncKind::Fullmatch => Some(self.compare_expr(CmpOp::Eq)),
// string.split(pattern)
ReFuncKind::Split => self.method_expr("split", vec![self.pattern.clone()]),
ReFuncKind::Split => Some(self.method_expr("split", vec![self.pattern.clone()])),
}
}
@@ -248,3 +277,23 @@ impl<'a> ReFunc<'a> {
})
}
}
/// Try to resolve `name` to an [`ExprStringLiteral`] in `semantic`.
///
/// Two shapes of `name` are accepted:
///
/// * a string literal expression, which is returned directly;
/// * a name expression for which `semantic.only_binding` yields a binding whose
///   value, as reported by `find_binding_value`, is itself a string literal.
///
/// Every other case returns `None`, including a name whose bound value is
/// anything other than a string literal (for example, another name).
fn resolve_string_literal<'a>(
    name: &'a Expr,
    semantic: &'a SemanticModel,
) -> Option<&'a ExprStringLiteral> {
    // Fast path: the argument is already a literal, so no lookup is needed.
    if let Some(literal) = name.as_string_literal_expr() {
        return Some(literal);
    }

    // Otherwise only plain names are followed; each step bails out with `None`
    // as soon as the lookup cannot continue.
    let name_expr = name.as_name_expr()?;
    let binding_id = semantic.only_binding(name_expr)?;
    let bound_value = find_binding_value(semantic.binding(binding_id), semantic)?;

    // The value found for the binding must itself be a string literal.
    bound_value.as_string_literal_expr()
}

View File

@@ -1,129 +0,0 @@
---
source: crates/ruff_linter/src/rules/ruff/mod.rs
snapshot_kind: text
---
RUF055.py:6:1: RUF055 [*] Plain string pattern passed to `re` function
|
5 | # this should be replaced with s.replace("abc", "")
6 | re.sub("abc", "", s)
| ^^^^^^^^^^^^^^^^^^^^ RUF055
|
= help: Replace with `s.replace("abc", "")`
Safe fix
3 3 | s = "str"
4 4 |
5 5 | # this should be replaced with s.replace("abc", "")
6 |-re.sub("abc", "", s)
6 |+s.replace("abc", "")
7 7 |
8 8 |
9 9 | # this example, adapted from https://docs.python.org/3/library/re.html#re.sub,
RUF055.py:22:4: RUF055 [*] Plain string pattern passed to `re` function
|
20 | # this one should be replaced with s.startswith("abc") because the Match is
21 | # used in an if context for its truth value
22 | if re.match("abc", s):
| ^^^^^^^^^^^^^^^^^^ RUF055
23 | pass
24 | if m := re.match("abc", s): # this should *not* be replaced
|
= help: Replace with `s.startswith("abc")`
Safe fix
19 19 |
20 20 | # this one should be replaced with s.startswith("abc") because the Match is
21 21 | # used in an if context for its truth value
22 |-if re.match("abc", s):
22 |+if s.startswith("abc"):
23 23 | pass
24 24 | if m := re.match("abc", s): # this should *not* be replaced
25 25 | pass
RUF055.py:29:4: RUF055 [*] Plain string pattern passed to `re` function
|
28 | # this should be replaced with "abc" in s
29 | if re.search("abc", s):
| ^^^^^^^^^^^^^^^^^^^ RUF055
30 | pass
31 | re.search("abc", s) # this should not be replaced
|
= help: Replace with `"abc" in s`
Safe fix
26 26 | re.match("abc", s) # this should not be replaced because match returns a Match
27 27 |
28 28 | # this should be replaced with "abc" in s
29 |-if re.search("abc", s):
29 |+if "abc" in s:
30 30 | pass
31 31 | re.search("abc", s) # this should not be replaced
32 32 |
RUF055.py:34:4: RUF055 [*] Plain string pattern passed to `re` function
|
33 | # this should be replaced with "abc" == s
34 | if re.fullmatch("abc", s):
| ^^^^^^^^^^^^^^^^^^^^^^ RUF055
35 | pass
36 | re.fullmatch("abc", s) # this should not be replaced
|
= help: Replace with `"abc" == s`
Safe fix
31 31 | re.search("abc", s) # this should not be replaced
32 32 |
33 33 | # this should be replaced with "abc" == s
34 |-if re.fullmatch("abc", s):
34 |+if "abc" == s:
35 35 | pass
36 36 | re.fullmatch("abc", s) # this should not be replaced
37 37 |
RUF055.py:39:1: RUF055 [*] Plain string pattern passed to `re` function
|
38 | # this should be replaced with s.split("abc")
39 | re.split("abc", s)
| ^^^^^^^^^^^^^^^^^^ RUF055
40 |
41 | # these currently should not be modified because the patterns contain regex
|
= help: Replace with `s.split("abc")`
Safe fix
36 36 | re.fullmatch("abc", s) # this should not be replaced
37 37 |
38 38 | # this should be replaced with s.split("abc")
39 |-re.split("abc", s)
39 |+s.split("abc")
40 40 |
41 41 | # these currently should not be modified because the patterns contain regex
42 42 | # metacharacters
RUF055.py:70:1: RUF055 [*] Plain string pattern passed to `re` function
|
69 | # this should trigger an unsafe fix because of the presence of comments
70 | / re.sub(
71 | | # pattern
72 | | "abc",
73 | | # repl
74 | | "",
75 | | s, # string
76 | | )
| |_^ RUF055
|
= help: Replace with `s.replace("abc", "")`
Unsafe fix
67 67 | re.split("abc", s, maxsplit=2)
68 68 |
69 69 | # this should trigger an unsafe fix because of the presence of comments
70 |-re.sub(
71 |- # pattern
72 |- "abc",
73 |- # repl
74 |- "",
75 |- s, # string
76 |-)
70 |+s.replace("abc", "")

View File

@@ -0,0 +1,227 @@
---
source: crates/ruff_linter/src/rules/ruff/mod.rs
---
RUF055_0.py:6:1: RUF055 [*] Plain string pattern passed to `re` function
|
5 | # this should be replaced with s.replace("abc", "")
6 | re.sub("abc", "", s)
| ^^^^^^^^^^^^^^^^^^^^ RUF055
|
= help: Replace with `s.replace("abc", "")`
Safe fix
3 3 | s = "str"
4 4 |
5 5 | # this should be replaced with s.replace("abc", "")
6 |-re.sub("abc", "", s)
6 |+s.replace("abc", "")
7 7 |
8 8 |
9 9 | # this example, adapted from https://docs.python.org/3/library/re.html#re.sub,
RUF055_0.py:22:4: RUF055 [*] Plain string pattern passed to `re` function
|
20 | # this one should be replaced with s.startswith("abc") because the Match is
21 | # used in an if context for its truth value
22 | if re.match("abc", s):
| ^^^^^^^^^^^^^^^^^^ RUF055
23 | pass
24 | if m := re.match("abc", s): # this should *not* be replaced
|
= help: Replace with `s.startswith("abc")`
Safe fix
19 19 |
20 20 | # this one should be replaced with s.startswith("abc") because the Match is
21 21 | # used in an if context for its truth value
22 |-if re.match("abc", s):
22 |+if s.startswith("abc"):
23 23 | pass
24 24 | if m := re.match("abc", s): # this should *not* be replaced
25 25 | pass
RUF055_0.py:29:4: RUF055 [*] Plain string pattern passed to `re` function
|
28 | # this should be replaced with "abc" in s
29 | if re.search("abc", s):
| ^^^^^^^^^^^^^^^^^^^ RUF055
30 | pass
31 | re.search("abc", s) # this should not be replaced
|
= help: Replace with `"abc" in s`
Safe fix
26 26 | re.match("abc", s) # this should not be replaced because match returns a Match
27 27 |
28 28 | # this should be replaced with "abc" in s
29 |-if re.search("abc", s):
29 |+if "abc" in s:
30 30 | pass
31 31 | re.search("abc", s) # this should not be replaced
32 32 |
RUF055_0.py:34:4: RUF055 [*] Plain string pattern passed to `re` function
|
33 | # this should be replaced with "abc" == s
34 | if re.fullmatch("abc", s):
| ^^^^^^^^^^^^^^^^^^^^^^ RUF055
35 | pass
36 | re.fullmatch("abc", s) # this should not be replaced
|
= help: Replace with `"abc" == s`
Safe fix
31 31 | re.search("abc", s) # this should not be replaced
32 32 |
33 33 | # this should be replaced with "abc" == s
34 |-if re.fullmatch("abc", s):
34 |+if "abc" == s:
35 35 | pass
36 36 | re.fullmatch("abc", s) # this should not be replaced
37 37 |
RUF055_0.py:39:1: RUF055 [*] Plain string pattern passed to `re` function
|
38 | # this should be replaced with s.split("abc")
39 | re.split("abc", s)
| ^^^^^^^^^^^^^^^^^^ RUF055
40 |
41 | # these currently should not be modified because the patterns contain regex
|
= help: Replace with `s.split("abc")`
Safe fix
36 36 | re.fullmatch("abc", s) # this should not be replaced
37 37 |
38 38 | # this should be replaced with s.split("abc")
39 |-re.split("abc", s)
39 |+s.split("abc")
40 40 |
41 41 | # these currently should not be modified because the patterns contain regex
42 42 | # metacharacters
RUF055_0.py:70:1: RUF055 [*] Plain string pattern passed to `re` function
|
69 | # this should trigger an unsafe fix because of the presence of comments
70 | / re.sub(
71 | | # pattern
72 | | "abc",
73 | | # repl
74 | | "",
75 | | s, # string
76 | | )
| |_^ RUF055
77 |
78 | # A diagnostic should not be emitted for `sub` replacements with backreferences or
|
= help: Replace with `s.replace("abc", "")`
Unsafe fix
67 67 | re.split("abc", s, maxsplit=2)
68 68 |
69 69 | # this should trigger an unsafe fix because of the presence of comments
70 |-re.sub(
71 |- # pattern
72 |- "abc",
73 |- # repl
74 |- "",
75 |- s, # string
76 |-)
70 |+s.replace("abc", "")
77 71 |
78 72 | # A diagnostic should not be emitted for `sub` replacements with backreferences or
79 73 | # most other ASCII escapes
RUF055_0.py:88:1: RUF055 [*] Plain string pattern passed to `re` function
|
86 | # *not* `some_string.replace("a", "\\n")`.
87 | # We currently emit diagnostics for some of these without fixing them.
88 | re.sub(r"a", "\n", "a")
| ^^^^^^^^^^^^^^^^^^^^^^^ RUF055
89 | re.sub(r"a", r"\n", "a")
90 | re.sub(r"a", "\a", "a")
|
= help: Replace with `"a".replace("a", "\n")`
Safe fix
85 85 | # `re.sub(r"a", r"\n", some_string)` is fixed to `some_string.replace("a", "\n")`
86 86 | # *not* `some_string.replace("a", "\\n")`.
87 87 | # We currently emit diagnostics for some of these without fixing them.
88 |-re.sub(r"a", "\n", "a")
88 |+"a".replace("a", "\n")
89 89 | re.sub(r"a", r"\n", "a")
90 90 | re.sub(r"a", "\a", "a")
91 91 | re.sub(r"a", r"\a", "a")
RUF055_0.py:89:1: RUF055 Plain string pattern passed to `re` function
|
87 | # We currently emit diagnostics for some of these without fixing them.
88 | re.sub(r"a", "\n", "a")
89 | re.sub(r"a", r"\n", "a")
| ^^^^^^^^^^^^^^^^^^^^^^^^ RUF055
90 | re.sub(r"a", "\a", "a")
91 | re.sub(r"a", r"\a", "a")
|
RUF055_0.py:90:1: RUF055 [*] Plain string pattern passed to `re` function
|
88 | re.sub(r"a", "\n", "a")
89 | re.sub(r"a", r"\n", "a")
90 | re.sub(r"a", "\a", "a")
| ^^^^^^^^^^^^^^^^^^^^^^^ RUF055
91 | re.sub(r"a", r"\a", "a")
|
= help: Replace with `"a".replace("a", "\x07")`
Safe fix
87 87 | # We currently emit diagnostics for some of these without fixing them.
88 88 | re.sub(r"a", "\n", "a")
89 89 | re.sub(r"a", r"\n", "a")
90 |-re.sub(r"a", "\a", "a")
90 |+"a".replace("a", "\x07")
91 91 | re.sub(r"a", r"\a", "a")
92 92 |
93 93 | re.sub(r"a", "\?", "a")
RUF055_0.py:91:1: RUF055 Plain string pattern passed to `re` function
|
89 | re.sub(r"a", r"\n", "a")
90 | re.sub(r"a", "\a", "a")
91 | re.sub(r"a", r"\a", "a")
| ^^^^^^^^^^^^^^^^^^^^^^^^ RUF055
92 |
93 | re.sub(r"a", "\?", "a")
|
RUF055_0.py:93:1: RUF055 [*] Plain string pattern passed to `re` function
|
91 | re.sub(r"a", r"\a", "a")
92 |
93 | re.sub(r"a", "\?", "a")
| ^^^^^^^^^^^^^^^^^^^^^^^ RUF055
94 | re.sub(r"a", r"\?", "a")
|
= help: Replace with `"a".replace("a", "\\?")`
Safe fix
90 90 | re.sub(r"a", "\a", "a")
91 91 | re.sub(r"a", r"\a", "a")
92 92 |
93 |-re.sub(r"a", "\?", "a")
93 |+"a".replace("a", "\\?")
94 94 | re.sub(r"a", r"\?", "a")
RUF055_0.py:94:1: RUF055 [*] Plain string pattern passed to `re` function
|
93 | re.sub(r"a", "\?", "a")
94 | re.sub(r"a", r"\?", "a")
| ^^^^^^^^^^^^^^^^^^^^^^^^ RUF055
|
= help: Replace with `"a".replace("a", "\\?")`
Safe fix
91 91 | re.sub(r"a", r"\a", "a")
92 92 |
93 93 | re.sub(r"a", "\?", "a")
94 |-re.sub(r"a", r"\?", "a")
94 |+"a".replace("a", "\\?")

View File

@@ -0,0 +1,39 @@
---
source: crates/ruff_linter/src/rules/ruff/mod.rs
---
RUF055_1.py:9:1: RUF055 [*] Plain string pattern passed to `re` function
|
7 | pat1 = "needle"
8 |
9 | re.sub(pat1, "", haystack)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^ RUF055
10 |
11 | # aliases are not followed, so this one should not trigger the rule
|
= help: Replace with `haystack.replace(pat1, "")`
Safe fix
6 6 |
7 7 | pat1 = "needle"
8 8 |
9 |-re.sub(pat1, "", haystack)
9 |+haystack.replace(pat1, "")
10 10 |
11 11 | # aliases are not followed, so this one should not trigger the rule
12 12 | if pat4 := pat1:
RUF055_1.py:17:1: RUF055 [*] Plain string pattern passed to `re` function
|
15 | # also works for the `repl` argument in sub
16 | repl = "new"
17 | re.sub(r"abc", repl, haystack)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ RUF055
|
= help: Replace with `haystack.replace("abc", repl)`
Safe fix
14 14 |
15 15 | # also works for the `repl` argument in sub
16 16 | repl = "new"
17 |-re.sub(r"abc", repl, haystack)
17 |+haystack.replace("abc", repl)