Improved regexp/strict rule (#225)

* Improved `regexp/strict` rule * Update * Disable validator for patterns with named backrefs
ota-meshi · Jun 4, 2021 · 637ee67 · 637ee67
1 parent 90d0ce1
commit 637ee67
Show file tree

Hide file tree

Showing 5 changed files with 449 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -111,7 +111,7 @@ The rules with the following star :star: are included in the `plugin:regexp/reco
 | [regexp/no-useless-assertions](https://ota-meshi.github.io/eslint-plugin-regexp/rules/no-useless-assertions.html) | disallow assertions that are known to always accept (or reject) |  |
 | [regexp/no-useless-backreference](https://ota-meshi.github.io/eslint-plugin-regexp/rules/no-useless-backreference.html) | disallow useless backreferences in regular expressions | :star: |
 | [regexp/no-useless-dollar-replacements](https://ota-meshi.github.io/eslint-plugin-regexp/rules/no-useless-dollar-replacements.html) | disallow useless `$` replacements in replacement string |  |
-| [regexp/strict](https://ota-meshi.github.io/eslint-plugin-regexp/rules/strict.html) | disallow not strictly valid regular expressions |  |
+| [regexp/strict](https://ota-meshi.github.io/eslint-plugin-regexp/rules/strict.html) | disallow not strictly valid regular expressions | :wrench: |
 
 ### Best Practices
 

diff --git a/docs/rules/README.md b/docs/rules/README.md
@@ -25,7 +25,7 @@ The rules with the following star :star: are included in the `plugin:regexp/reco
 | [regexp/no-useless-assertions](./no-useless-assertions.md) | disallow assertions that are known to always accept (or reject) |  |
 | [regexp/no-useless-backreference](./no-useless-backreference.md) | disallow useless backreferences in regular expressions | :star: |
 | [regexp/no-useless-dollar-replacements](./no-useless-dollar-replacements.md) | disallow useless `$` replacements in replacement string |  |
-| [regexp/strict](./strict.md) | disallow not strictly valid regular expressions |  |
+| [regexp/strict](./strict.md) | disallow not strictly valid regular expressions | :wrench: |
 
 ### Best Practices
 

diff --git a/docs/rules/strict.md b/docs/rules/strict.md
@@ -9,6 +9,7 @@ description: "disallow not strictly valid regular expressions"
 > disallow not strictly valid regular expressions
 
 - :exclamation: <badge text="This rule has not been released yet." vertical="middle" type="error"> ***This rule has not been released yet.*** </badge>
+- :wrench: The `--fix` option on the [command line](https://eslint.org/docs/user-guide/command-line-interface#fixing-problems) can automatically fix some of the problems reported by this rule.
 
 ## :book: Rule Details
 
@@ -20,7 +21,7 @@ Depending on the syntax defined in [Annex B] of the ECMAScript specification, so
 
 [Annex B]: https://tc39.es/ecma262/#sec-regular-expressions-patterns
 
-<eslint-code-block>
+<eslint-code-block fix>
 
 ```js
 /* eslint regexp/strict: "error" */

diff --git a/lib/rules/strict.ts b/lib/rules/strict.ts
@@ -1,15 +1,24 @@
 import { RegExpValidator } from "regexpp"
+import type { CharacterClassElement, Element } from "regexpp/ast"
 import type { RegExpVisitor } from "regexpp/visitor"
 import type { RegExpContext } from "../utils"
-import { createRule, defineRegexpVisitor } from "../utils"
+import {
+    isOctalEscape,
+    createRule,
+    defineRegexpVisitor,
+    isEscapeSequence,
+} from "../utils"
 
-const validator = new RegExpValidator({ strict: true })
+const validator = new RegExpValidator({ strict: true, ecmaVersion: 2020 })
 
 /**
  * Check syntax error in a given pattern.
- * @returns {string|null} The syntax error.
+ * @returns The syntax error.
  */
-function validateRegExpPattern(pattern: string, uFlag?: boolean) {
+function validateRegExpPattern(
+    pattern: string,
+    uFlag?: boolean,
+): string | null {
     try {
         validator.validatePattern(pattern, undefined, undefined, uFlag)
         return null
@@ -18,6 +27,9 @@ function validateRegExpPattern(pattern: string, uFlag?: boolean) {
     }
 }
 
+const CHARACTER_CLASS_SYNTAX_CHARACTERS = new Set("\\/()[]{}^$.|-+*?".split(""))
+const SYNTAX_CHARACTERS = new Set("\\/()[]{}^$.|+*?".split(""))
+
 export default createRule("strict", {
     meta: {
         docs: {
@@ -27,8 +39,33 @@ export default createRule("strict", {
             // recommended: true,
             recommended: false,
         },
+        fixable: "code",
         schema: [],
         messages: {
+            // character escape
+            invalidControlEscape:
+                "Invalid or incomplete control escape sequence. Either use a valid control escape sequence or escaping the standalone backslash.",
+            incompleteEscapeSequence:
+                "Incomplete escape sequence '{{expr}}'. Either use a valid escape sequence or remove the useless escaping.",
+            invalidPropertyEscape:
+                "Invalid property escape sequence '{{expr}}'. Either use a valid property escape sequence or remove the useless escaping.",
+            incompleteBackreference:
+                "Incomplete backreference '{{expr}}'. Either use a valid backreference or remove the useless escaping.",
+            unescapedSourceCharacter: "Unescaped source character '{{expr}}'.",
+            octalEscape:
+                "Invalid legacy octal escape sequence '{{expr}}'. Use a hexadecimal escape instead.",
+            uselessEscape:
+                "Useless identity escapes with non-syntax characters are forbidden.",
+
+            // character class
+            invalidRange:
+                "Invalid character class range. A character set cannot be the minimum or maximum of a character class range. Either escape the `-` or fix the character class range.",
+
+            // assertion
+            quantifiedAssertion:
+                "Assertion are not allowed to be quantified directly.",
+
+            // validator
             regexMessage: "{{message}}.",
         },
         type: "suggestion",
@@ -40,21 +77,195 @@ export default createRule("strict", {
         function createVisitor(
             regexpContext: RegExpContext,
         ): RegExpVisitor.Handlers {
-            const { node, flags, pattern } = regexpContext
+            const {
+                node,
+                flags,
+                pattern,
+                getRegexpLocation,
+                fixReplaceNode,
+            } = regexpContext
+
+            if (flags.unicode) {
+                // the Unicode flag enables strict parsing mode automatically
+                return {}
+            }
 
-            const message = validateRegExpPattern(pattern, flags.unicode)
+            let reported = false
+            let hasNamedBackreference = false
+
+            /** Report */
+            function report(
+                messageId: string,
+                element: Element,
+                fix?: string | null,
+            ): void {
+                reported = true
 
-            if (message) {
                 context.report({
                     node,
-                    messageId: "regexMessage",
+                    loc: getRegexpLocation(element),
+                    messageId,
                     data: {
-                        message,
+                        expr: element.raw,
                     },
+                    fix: fix ? fixReplaceNode(element, fix) : null,
                 })
             }
 
-            return {}
+            return {
+                // eslint-disable-next-line complexity -- x
+                onCharacterEnter(cNode) {
+                    if (cNode.raw === "\\") {
+                        // e.g. \c5 or \c
+                        report("invalidControlEscape", cNode)
+                        return
+                    }
+                    if (cNode.raw === "\\u" || cNode.raw === "\\x") {
+                        // e.g. \u000;
+                        report("incompleteEscapeSequence", cNode)
+                        return
+                    }
+                    if (cNode.raw === "\\p" || cNode.raw === "\\P") {
+                        // e.g. \p{H} or \p
+                        report("invalidPropertyEscape", cNode)
+                        return
+                    }
+                    if (cNode.value !== 0 && isOctalEscape(cNode.raw)) {
+                        // e.g. \023
+                        report(
+                            "octalEscape",
+                            cNode,
+                            `\\x${cNode.value.toString(16).padStart(2, "0")}`,
+                        )
+                        return
+                    }
+
+                    const insideCharClass =
+                        cNode.parent.type === "CharacterClass" ||
+                        cNode.parent.type === "CharacterClassRange"
+
+                    if (!insideCharClass) {
+                        if (cNode.raw === "\\k") {
+                            // e.g. \k<foo or \k
+                            report("incompleteBackreference", cNode)
+                            return
+                        }
+
+                        if (
+                            cNode.raw === "{" ||
+                            cNode.raw === "}" ||
+                            cNode.raw === "]"
+                        ) {
+                            report(
+                                "unescapedSourceCharacter",
+                                cNode,
+                                `\\${cNode.raw}`,
+                            )
+                            return
+                        }
+                    }
+
+                    if (isEscapeSequence(cNode.raw)) {
+                        // all remaining escape sequences are valid
+                        return
+                    }
+
+                    if (cNode.raw.startsWith("\\")) {
+                        const identity = cNode.raw.slice(1)
+                        const syntaxChars = insideCharClass
+                            ? CHARACTER_CLASS_SYNTAX_CHARACTERS
+                            : SYNTAX_CHARACTERS
+
+                        if (
+                            cNode.value === identity.charCodeAt(0) &&
+                            !syntaxChars.has(identity)
+                        ) {
+                            // e.g. \g or \;
+                            report("uselessEscape", cNode, identity)
+                        }
+                    }
+                },
+                onCharacterClassEnter(ccNode) {
+                    for (let i = 0; i < ccNode.elements.length; i++) {
+                        const current = ccNode.elements[i]
+
+                        if (current.type === "CharacterSet") {
+                            const next: CharacterClassElement | undefined =
+                                ccNode.elements[i + 1]
+                            const nextNext: CharacterClassElement | undefined =
+                                ccNode.elements[i + 2]
+
+                            if (next && next.raw === "-" && nextNext) {
+                                // e.g. [\w-a]
+                                report("invalidRange", current)
+                                return
+                            }
+
+                            const prev: CharacterClassElement | undefined =
+                                ccNode.elements[i - 1]
+                            const prevPrev: CharacterClassElement | undefined =
+                                ccNode.elements[i - 2]
+                            if (
+                                prev &&
+                                prev.raw === "-" &&
+                                prevPrev &&
+                                prevPrev.type !== "CharacterClassRange"
+                            ) {
+                                // e.g. [a-\w]
+                                report("invalidRange", current)
+                                return
+                            }
+                        }
+                    }
+                },
+                onQuantifierEnter(qNode) {
+                    if (qNode.element.type === "Assertion") {
+                        // e.g. \b+
+                        report(
+                            "quantifiedAssertion",
+                            qNode,
+                            `(?:${qNode.element.raw})${qNode.raw.slice(
+                                qNode.element.end - qNode.start,
+                            )}`,
+                        )
+                    }
+                },
+
+                onBackreferenceEnter(bNode) {
+                    if (typeof bNode.ref === "string") {
+                        hasNamedBackreference = true
+                    }
+                },
+                onPatternLeave() {
+                    if (hasNamedBackreference) {
+                        // There is a bug in regexpp that causes it throw a
+                        // syntax error for all non-Unicode regexes with named
+                        // backreferences.
+                        // TODO: Remove this workaround when the bug is fixed.
+                        return
+                    }
+
+                    if (!reported) {
+                        // our own logic couldn't find any problems,
+                        // so let's use a real parser to do the job.
+
+                        const message = validateRegExpPattern(
+                            pattern,
+                            flags.unicode,
+                        )
+
+                        if (message) {
+                            context.report({
+                                node,
+                                messageId: "regexMessage",
+                                data: {
+                                    message,
+                                },
+                            })
+                        }
+                    }
+                },
+            }
         }
 
         return defineRegexpVisitor(context, {