[format checker] check for anomalous backslash escape (new W1401, W1402). Closes #104571

author: Sylvain Thénault <sylvain.thenault@logilab.fr>
changeset: b40e3b4bc006
branch: default
phase: public
hidden: no
parent revision: #21a4414695c2 fix R0801 similarities bug. Closes #63424
child revision: #b9dab5059b81 Closes #104572: symbolic warning names in output (by Martin Pool)
files modified by this revision
ChangeLog
README
checkers/__init__.py
checkers/format.py
test/input/func_excess_escapes.py
test/messages/func_excess_escapes.txt
# HG changeset patch
# User Sylvain Thénault <sylvain.thenault@logilab.fr>
# Date 1348063868 -7200
# Wed Sep 19 16:11:08 2012 +0200
# Node ID b40e3b4bc00692ccb503520cb689bc8d03d33cbd
# Parent 21a4414695c2ab14a6186e79abc82d4e35064de4
[format checker] check for anomalous backslash escape (new W1401, W1402). Closes #104571

diff --git a/ChangeLog b/ChangeLog
@@ -1,9 +1,12 @@
1  ChangeLog for PyLint
2  ====================
3 
4  --
5 +    * #104571: check for anomalous backslash escape, introducing new
6 +      W1401 and W1402 messages (patch by Martin Pool)
7 +
8      * #100707: check for boolop being used as exception class, introducing
9        new W0711 message (patch by Tim Hatch)
10 
11      * #4014: improve checking of metaclass methods first args, introducing
12        new C0204 message (patch by lothiraldan@gmail.com finalized by sthenault)
diff --git a/README b/README
@@ -63,8 +63,9 @@
13  * Benjamin Niemann: patch to allow block level enabling/disabling of messages
14  * Nathaniel Manista: suspicious lambda checking
15  * Wolfgang Grafen, Axel Muller, Fabio Zadrozny, Pierre Rouleau,
16    Maarten ter Huurne, Mirko Friedenhagen (among others):
17    bug reports, feedback, feature requests...
18 +* Martin Pool (Google): warnings for anomalous backslashes
19  * All the Logilab's team: daily use, bug reports, feature requests
20  * Other people have contributed by their feedback, if I've forgotten
21    you, send me a note !
diff --git a/checkers/__init__.py b/checkers/__init__.py
@@ -27,10 +27,11 @@
22  09: design_analysis
23  10: newstyle
24  11: typecheck
25  12: logging
26  13: string_format
27 +14: string_constant
28  14-50: not yet used: reserved for future internal checkers.
29  51-99: perhaps used: reserved for external checkers
30 
31  The raw_metrics checker has no number associated since it doesn't emit any
32  messages nor reports. XXX not true, emit a 07 report !
diff --git a/checkers/format.py b/checkers/format.py
@@ -1,7 +1,9 @@
33  # Copyright (c) 2003-2010 Sylvain Thenault (thenault@gmail.com).
34  # Copyright (c) 2003-2012 LOGILAB S.A. (Paris, FRANCE).
35 +# Copyright 2012 Google Inc.
36 +#
37  # This program is free software; you can redistribute it and/or modify it under
38  # the terms of the GNU General Public License as published by the Free Software
39  # Foundation; either version 2 of the License, or (at your option) any later
40  # version.
41  #
@@ -50,14 +52,14 @@
42                isn\'t necessary (that\'s python, not C ;).'),
43      'C0321': ('More than one statement on a single line',
44                'Used when more than on statement are found on the same line.'),
45      'C0322': ('Operator not preceded by a space\n%s',
46                'Used when one of the following operator (!= | <= | == | >= | < '
47 -              '| > | = | \+= | -= | \*= | /= | %) is not preceded by a space.'),
48 +              '| > | = | \\+= | -= | \\*= | /= | %) is not preceded by a space.'),
49      'C0323': ('Operator not followed by a space\n%s',
50                'Used when one of the following operator (!= | <= | == | >= | < '
51 -              '| > | = | \+= | -= | \*= | /= | %) is not followed by a space.'),
52 +              '| > | = | \\+= | -= | \\*= | /= | %) is not followed by a space.'),
53      'C0324': ('Comma not followed by a space\n%s',
54                'Used when a comma (",") is not followed by a space.'),
55      }
56 
57  if sys.version_info < (3, 0):
@@ -79,11 +81,11 @@
58  SQSTRING_RGX = r'"([^"\\]|\\.)*?"'
59  # simple apostrophed rgx
60  SASTRING_RGX = r"'([^'\\]|\\.)*?'"
61  # triple quoted string rgx
62  TQSTRING_RGX = r'"""([^"]|("(?!"")))*?(""")'
63 -# triple apostrophed string rgx # FIXME english please
64 +# triple apostrophe'd string rgx
65  TASTRING_RGX = r"'''([^']|('(?!'')))*?(''')"
66 
67  # finally, the string regular expression
68  STRING_RGX = re.compile('(%s)|(%s)|(%s)|(%s)' % (TQSTRING_RGX, TASTRING_RGX,
69                                                   SQSTRING_RGX, SASTRING_RGX),
@@ -107,15 +109,16 @@
70 
71      (re.compile(OP_RGX_MATCH_2, re.M),
72       re.compile(OP_RGX_SEARCH_2, re.M),
73       'C0323'),
74 
75 -    (re.compile(r'.*,[^(\s|\]|}|\))].*', re.M), 
76 +    (re.compile(r'.*,[^(\s|\]|}|\))].*', re.M),
77       re.compile(r',[^\s)]', re.M),
78       'C0324'),
79      )
80 
81 +_PY3K = sys.version_info >= (3, 0)
82 
83  def get_string_coords(line):
84      """return a list of string positions (tuple (start, end)) in the line
85      """
86      result = []
@@ -354,8 +357,103 @@
87              self.add_message('W0311', line=line_num,
88                               args=(level * unit_size + len(suppl), i_type,
89                                     expected * unit_size))
90 
91 
92 +class StringConstantChecker(BaseRawChecker):
93 +    """Check string literals"""
94 +
95 +    msgs = {
96 +        'W1401': ('Anomalous backslash in string: \'%s\'. '
97 +                  'String constant might be missing an r prefix.',
98 +                  'Used when a backslash is in a literal string but not as an '
99 +                  'escape.'),
100 +        'W1402': ('Anomalous Unicode escape in byte string: \'%s\'. '
101 +                  'String constant might be missing an r or u prefix.',
102 +                  'Used when an escape like \\u is encountered in a byte '
103 +                  'string where it has no effect.'),
104 +        }
105 +    name = 'string_constant'
106 +    __implements__ = (IRawChecker, IASTNGChecker)
107 +
108 +    # Characters that have a special meaning after a backslash in either
109 +    # Unicode or byte strings.
110 +    ESCAPE_CHARACTERS = 'abfnrtvox\n\r\t\\\'\"'
111 +
112 +    # Characters that have a special meaning after a backslash but only in
113 +    # Unicode strings.
114 +    UNICODE_ESCAPE_CHARACTERS = 'uUN'
115 +
116 +    def process_tokens(self, tokens):
117 +        for (tok_type, token, (start_row, start_col), _, _) in tokens:
118 +            if tok_type == tokenize.STRING:
119 +                # 'token' is the whole un-parsed token; we can look at the start
120 +                # of it to see whether it's a raw or unicode string etc.
121 +                self.process_string_token(token, start_row, start_col)
122 +
123 +    def process_string_token(self, token, start_row, start_col):
124 +        for i, c in enumerate(token):
125 +            if c in '\'\"':
126 +                quote_char = c
127 +                break
128 +        prefix = token[:i].lower()  #  markers like u, b, r.
129 +        after_prefix = token[i:]
130 +        if after_prefix[:3] == after_prefix[-3:] == 3 * quote_char:
131 +            string_body = after_prefix[3:-3]
132 +        else:
133 +            string_body = after_prefix[1:-1]  # Chop off quotes
134 +        # No special checks on raw strings at the moment.
135 +        if 'r' not in prefix:
136 +            self.process_non_raw_string_token(prefix, string_body,
137 +                start_row, start_col)
138 +
139 +    def process_non_raw_string_token(self, prefix, string_body, start_row,
140 +        start_col):
141 +        """check for bad escapes in a non-raw string.
142 +
143 +        prefix: lowercase string of eg 'ur' string prefix markers.
144 +        string_body: the un-parsed body of the string, not including the quote
145 +        marks.
146 +        start_row: integer line number in the source.
147 +        start_col: integer column number in the source.
148 +        """
149 +        # Walk through the string; if we see a backslash then escape the next
150 +        # character, and skip over it.  If we see a non-escaped character,
151 +        # alert, and continue.
152 +        #
153 +        # Accept a backslash when it escapes a backslash, or a quote, or
154 +        # end-of-line, or one of the letters that introduce a special escape
155 +        # sequence <http://docs.python.org/reference/lexical_analysis.html>
156 +        #
157 +        # TODO(mbp): Maybe give a separate warning about the rarely-used
158 +        # \a \b \v \f?
159 +        #
160 +        # TODO(mbp): We could give the column of the problem character, but
161 +        # add_message doesn't seem to have a way to pass it through at present.
162 +        i = 0
163 +        while True:
164 +            i = string_body.find('\\', i)
165 +            if i == -1:
166 +                break
167 +            # There must be a next character; having a backslash at the end
168 +            # of the string would be a SyntaxError.
169 +            next_char = string_body[i+1]
170 +            match = string_body[i:i+2]
171 +            if next_char in self.UNICODE_ESCAPE_CHARACTERS:
172 +                if 'u' in prefix:
173 +                    pass
174 +                elif _PY3K and 'b' not in prefix:
175 +                    pass  # unicode by default
176 +                else:
177 +                    self.add_message('W1402', line=start_row, args=(match, ))
178 +            elif next_char not in self.ESCAPE_CHARACTERS:
179 +                self.add_message('W1401', line=start_row, args=(match, ))
180 +            # Whether it was a valid escape or not, backslash followed by
181 +            # another character can always be consumed whole: the second
182 +            # character can never be the start of a new backslash escape.
183 +            i += 2
184 +
185 +
186  def register(linter):
187      """required method to auto register this checker """
188      linter.register_checker(FormatChecker(linter))
189 +    linter.register_checker(StringConstantChecker(linter))
diff --git a/test/input/func_excess_escapes.py b/test/input/func_excess_escapes.py
@@ -0,0 +1,44 @@
190 +# pylint:disable=W0105, W0511
191 +"""Stray backslash escapes may be missing a raw-string prefix."""
192 +
193 +__revision__ = '$Id$'
194 +
195 +# Bad escape sequences, which probably don't do what you expect.
196 +A = "\[\]\\"
197 +assert '\/' == '\\/'
198 +ESCAPE_BACKSLASH = '\`'
199 +
200 +# Valid escape sequences.
201 +NEWLINE = "\n"
202 +OLD_ESCAPES = '\a\b\f\n\t\r\v'
203 +HEX = '\xad\x0a\x0d'
204 +OCTAL = '\o123\o000'
205 +UNICODE = u'\u1234'
206 +HIGH_UNICODE = u'\U0000abcd'
207 +QUOTES = '\'\"'
208 +LITERAL_NEWLINE = '\
209 +'
210 +ESCAPE_UNICODE = "\\\\n"
211 +
212 +# Bad docstring
213 +"""Even in a docstring
214 +
215 +You shouldn't have ambiguous text like: C:\Program Files\alpha
216 +"""
217 +
218 +# Would be valid in Unicode, but probably not what you want otherwise
219 +BAD_UNICODE = '\u0042'
220 +BAD_LONG_UNICODE = '\U00000042'
221 +BAD_NAMED_UNICODE = '\N{GREEK SMALL LETTER ALPHA}'
222 +
223 +GOOD_UNICODE = u'\u0042'
224 +GOOD_LONG_UNICODE = u'\U00000042'
225 +GOOD_NAMED_UNICODE = u'\N{GREEK SMALL LETTER ALPHA}'
226 +
227 +
228 +# Valid raw strings
229 +RAW_BACKSLASHES = r'raw'
230 +RAW_UNICODE = ur"\u0062\n"
231 +
232 +# In a comment you can have whatever you want: \ \\ \n \m
233 +# even things that look like bad strings: "C:\Program Files"
diff --git a/test/messages/func_excess_escapes.txt b/test/messages/func_excess_escapes.txt
@@ -0,0 +1,8 @@
234 +W:  7: Anomalous backslash in string: '\['. String constant might be missing an r prefix.
235 +W:  7: Anomalous backslash in string: '\]'. String constant might be missing an r prefix.
236 +W:  8: Anomalous backslash in string: '\/'. String constant might be missing an r prefix.
237 +W:  9: Anomalous backslash in string: '\`'. String constant might be missing an r prefix.
238 +W: 24: Anomalous backslash in string: '\P'. String constant might be missing an r prefix.
239 +W: 30: Anomalous Unicode escape in byte string: '\u'. String constant might be missing an r or u prefix.
240 +W: 31: Anomalous Unicode escape in byte string: '\U'. String constant might be missing an r or u prefix.
241 +W: 32: Anomalous Unicode escape in byte string: '\N'. String constant might be missing an r or u prefix.