805 lines
26 KiB
JavaScript
805 lines
26 KiB
JavaScript
// @ts-check
|
|
'use strict';
|
|
|
|
const { assert } = require('chai');
|
|
const PrismLoader = require('./helper/prism-loader');
|
|
const TestDiscovery = require('./helper/test-discovery');
|
|
const TestCase = require('./helper/test-case');
|
|
const { BFS, parseRegex } = require('./helper/util');
|
|
const { languages } = require('../components.json');
|
|
const { visitRegExpAST } = require('regexpp');
|
|
const { transform, combineTransformers, getIntersectionWordSets, JS, Words, NFA, Transformers } = require('refa');
|
|
const scslre = require('scslre');
|
|
const { argv } = require('yargs');
|
|
const RAA = require('regexp-ast-analysis');
|
|
|
|
/**
 * A map from language id to a list of code snippets in that language.
 *
 * @type {Map<string, string[]>}
 */
const testSnippets = new Map();
const testSuite = TestDiscovery.loadAllTests(__dirname + '/languages');
for (const languageIdentifier in testSuite) {
	const lang = TestCase.parseLanguageNames(languageIdentifier).mainLanguage;

	// group all snippets by their main language
	if (!testSnippets.has(lang)) {
		testSnippets.set(lang, []);
	}
	const snippets = testSnippets.get(lang);

	for (const file of testSuite[languageIdentifier]) {
		snippets.push(TestCase.TestCaseFile.readFromFile(file).code);
	}
}
|
|
|
|
|
|
for (const lang in languages) {
	// 'meta' is not a language; `--language` restricts the run to a single language
	if (lang === 'meta') {
		continue;
	}
	if (!!argv.language && lang !== argv.language) {
		continue;
	}

	describe(`Patterns of '${lang}'`, function () {
		testPatterns(PrismLoader.createInstance(lang), lang);
	});

	const optional = toArray(languages[lang].optional);
	const modify = toArray(languages[lang].modify);

	if (optional.length === 0 && modify.length === 0) {
		continue;
	}

	// test again with all optional/modify dependencies loaded, since they can change the grammar
	let name = `Patterns of '${lang}'`;
	if (optional.length > 0) {
		name += ` + optional dependencies '${optional.join("', '")}'`;
	}
	if (modify.length > 0) {
		name += ` + modify dependencies '${modify.join("', '")}'`;
	}

	describe(name, function () {
		testPatterns(PrismLoader.createInstance([...optional, ...modify, lang]), lang);
	});
}
|
|
|
|
/**
 * Tests all patterns in the given Prism instance.
 *
 * @param {any} Prism
 * @param {string} mainLanguage
 *
 * @typedef {import("./helper/util").LiteralAST} LiteralAST
 * @typedef {import("regexpp/ast").CapturingGroup} CapturingGroup
 * @typedef {import("regexpp/ast").Element} Element
 * @typedef {import("regexpp/ast").Group} Group
 * @typedef {import("regexpp/ast").LookaroundAssertion} LookaroundAssertion
 * @typedef {import("regexpp/ast").Pattern} Pattern
 */
function testPatterns(Prism, mainLanguage) {

	/**
	 * Returns a list of relevant languages in the Prism instance.
	 *
	 * The list does not include readonly dependencies and aliases.
	 *
	 * @returns {string[]}
	 */
	function getRelevantLanguages() {
		return [mainLanguage, ...toArray(languages[mainLanguage].modify)]
			.filter(lang => lang in Prism.languages);
	}

	/**
	 * Converts a BFS path into a human-readable property-access string (e.g. `Prism.languages.css.string`).
	 *
	 * @param {string} root
	 * @param {Parameters<Parameters<typeof BFS>[1]>[0]} path
	 * @returns {string}
	 */
	function BFSPathToString(root, path) {
		let pathStr = root;
		for (const { key } of path) {
			if (!key) {
				// do nothing
			} else if (/^\d+$/.test(key)) {
				pathStr += `[${key}]`;
			} else if (/^[a-z]\w*$/i.test(key)) {
				pathStr += `.${key}`;
			} else {
				pathStr += `[${JSON.stringify(key)}]`;
			}
		}
		return pathStr;
	}

	/**
	 * Invokes the given function on every pattern in `Prism.languages`.
	 *
	 * _Note:_ This will aggregate all errors thrown by the given callback and throw an aggregated error at the end
	 * of the iteration. You can also append any number of errors per callback using the `reportError` function.
	 *
	 * @param {(values: ForEachPatternCallbackValue) => void} callback
	 *
	 * @typedef ForEachPatternCallbackValue
	 * @property {RegExp} pattern
	 * @property {LiteralAST} ast
	 * @property {string} tokenPath
	 * @property {string} name
	 * @property {any} parent
	 * @property {boolean} lookbehind Whether the first capturing group of the pattern is a Prism lookbehind group.
	 * @property {CapturingGroup | undefined} lookbehindGroup
	 * @property {{ key: string, value: any }[]} path
	 * @property {(message: string) => void} reportError
	 */
	function forEachPattern(callback) {
		const visited = new Set();
		const errors = [];

		/**
		 * @param {object} root
		 * @param {string} rootStr
		 */
		function traverse(root, rootStr) {
			if (visited.has(root)) {
				return;
			}
			visited.add(root);

			BFS(root, path => {
				const { key, value } = path[path.length - 1];
				visited.add(value);

				const tokenPath = BFSPathToString(rootStr, path);

				if (Object.prototype.toString.call(value) == '[object RegExp]') {
					try {
						let ast;
						try {
							ast = parseRegex(value);
						} catch (error) {
							throw new SyntaxError(`Invalid RegExp at ${tokenPath}\n\n${error.message}`);
						}

						const parent = path.length > 1 ? path[path.length - 2].value : undefined;
						const lookbehind = key === 'pattern' && parent && !!parent.lookbehind;
						const lookbehindGroup = lookbehind ? getFirstCapturingGroup(ast.pattern) : undefined;
						callback({
							pattern: value,
							ast,
							tokenPath,
							name: key,
							parent,
							path,
							lookbehind,
							lookbehindGroup,
							reportError: message => errors.push(message)
						});
					} catch (error) {
						errors.push(error);
					}
				}
			});
		}

		// static analysis
		traverse(Prism.languages, 'Prism.languages');

		// dynamic analysis
		// Some grammars are created on the fly while tokenizing, so we also hook into
		// `Prism.tokenize` and traverse every grammar it is given.
		for (const lang of getRelevantLanguages()) {
			const snippets = testSnippets.get(lang);
			const grammar = Prism.languages[lang];

			const oldTokenize = Prism.tokenize;
			Prism.tokenize = function (_, grammar) {
				const result = oldTokenize.apply(this, arguments);
				traverse(grammar, lang + ': <Unknown>');
				return result;
			};

			for (const snippet of (snippets || [])) {
				Prism.highlight(snippet, grammar, lang);
			}

			Prism.tokenize = oldTokenize;
		}

		if (errors.length > 0) {
			throw new Error(errors.map(e => String(e.message || e)).join('\n\n'));
		}
	}

	/**
	 * Invokes the given callback for all capturing groups in the given pattern in left to right order.
	 *
	 * @param {Pattern} pattern
	 * @param {(values: ForEachCapturingGroupCallbackValue) => void} callback
	 *
	 * @typedef ForEachCapturingGroupCallbackValue
	 * @property {CapturingGroup} group
	 * @property {number} number Note: Starts at 1.
	 */
	function forEachCapturingGroup(pattern, callback) {
		let number = 0;
		visitRegExpAST(pattern, {
			onCapturingGroupEnter(node) {
				callback({
					group: node,
					number: ++number
				});
			}
		});
	}


	it('- should not match the empty string', function () {
		forEachPattern(({ ast, pattern, tokenPath }) => {
			// test for empty string
			const empty = RAA.isPotentiallyZeroLength(ast.pattern.alternatives);
			assert.isFalse(empty, `${tokenPath}: ${pattern} should not match the empty string.\n\n`
				+ `Patterns that do match the empty string can potentially cause infinitely many empty tokens. `
				+ `Make sure that all patterns always consume at least one character.`);
		});
	});

	it('- should have a capturing group if lookbehind is set to true', function () {
		forEachPattern(({ ast, tokenPath, lookbehind }) => {
			if (lookbehind) {
				let hasCapturingGroup = false;
				forEachCapturingGroup(ast.pattern, () => { hasCapturingGroup = true; });

				if (!hasCapturingGroup) {
					assert.fail(`${tokenPath}: The pattern is set to 'lookbehind: true' but does not have a capturing group.\n\n`
						+ `Prism lookbehind groups use the captured text of the first capturing group to simulate a lookbehind. `
						+ `Without a capturing group, a lookbehind is not possible.\n`
						+ `To fix this, either add a capturing group for the lookbehind or remove the 'lookbehind' property.`);
				}
			}
		});
	});

	it('- should not have lookbehind groups that can be preceded by some characters', function () {
		forEachPattern(({ tokenPath, lookbehindGroup }) => {
			if (lookbehindGroup && !isFirstMatch(lookbehindGroup)) {
				assert.fail(`${tokenPath}: The lookbehind group ${lookbehindGroup.raw} might be preceded by some characters.\n\n`
					+ `Prism assumes that the lookbehind group, if captured, is the first thing matched by the regex. `
					+ `If characters might precede the lookbehind group (e.g. /a?(b)c/), then Prism cannot apply the lookbehind correctly in all cases.\n`
					+ `To fix this, either remove the preceding characters or include them in the lookbehind group.`);
			}
		});
	});

	it('- should not have lookbehind groups that only have zero-width alternatives', function () {
		forEachPattern(({ tokenPath, lookbehindGroup, reportError }) => {
			if (lookbehindGroup && RAA.isZeroLength(lookbehindGroup)) {
				// strip the surrounding '(' and ')'
				const groupContent = lookbehindGroup.raw.slice(1, -1);
				const replacement = lookbehindGroup.alternatives.length === 1 ? groupContent : `(?:${groupContent})`;
				reportError(`${tokenPath}: The lookbehind group ${lookbehindGroup.raw} does not consume characters.\n\n`
					+ `Therefore it is not necessary to use a lookbehind group.\n`
					+ `To fix this, replace the lookbehind group with ${replacement} and remove the 'lookbehind' property.`);
			}
		});
	});

	it('- should not have unused capturing groups', function () {
		forEachPattern(({ ast, tokenPath, lookbehindGroup, reportError }) => {
			forEachCapturingGroup(ast.pattern, ({ group, number }) => {
				const isLookbehindGroup = group === lookbehindGroup;
				if (group.references.length === 0 && !isLookbehindGroup) {
					const fixes = [];
					fixes.push(`Make this group a non-capturing group ('(?:...)' instead of '(...)'). (It's usually this option.)`);
					fixes.push(`Reference this group with a backreference (use '\\${number}' for this).`);
					if (number === 1 && !lookbehindGroup) {
						if (isFirstMatch(group)) {
							fixes.push(`Add a 'lookbehind: true' declaration.`);
						} else {
							fixes.push(`Add a 'lookbehind: true' declaration. (This group is not a valid lookbehind group because it can be preceded by some characters.)`);
						}
					}

					reportError(`${tokenPath}: Unused capturing group ${group.raw}.\n\n`
						+ `Unused capturing groups generally degrade the performance of regular expressions. `
						+ `They might also be a sign that a backreference is incorrect or that a 'lookbehind: true' declaration is missing.\n`
						+ `To fix this, do one of the following:\n`
						+ fixes.map(f => '- ' + f).join('\n'));
				}
			});
		});
	});

	it('- should have nice names and aliases', function () {
		const niceName = /^[a-z][a-z\d]*(?:-[a-z\d]+)*$/;
		function testName(name, desc = 'token name') {
			if (!niceName.test(name)) {
				assert.fail(`The ${desc} '${name}' does not match ${niceName}.\n\n`
					+ `To fix this, choose a name that matches the above regular expression.`);
			}
		}

		forEachPattern(({ name, parent, tokenPath, path }) => {
			// token name
			let offset = 1;
			if (name == 'pattern') { // regex can be inside an object
				offset++;
			}
			if (Array.isArray(path[path.length - 1 - offset].value)) { // regex/regex object can be inside an array
				offset++;
			}
			const patternName = path[path.length - offset].key;
			testName(patternName);

			// check alias
			if (name == 'pattern' && 'alias' in parent) {
				const alias = parent.alias;
				if (typeof alias === 'string') {
					testName(alias, `alias of '${tokenPath}'`);
				} else if (Array.isArray(alias)) {
					alias.forEach(name => testName(name, `alias of '${tokenPath}'`));
				}
			}
		});
	});

	it('- should not use octal escapes', function () {
		forEachPattern(({ ast, tokenPath, reportError }) => {
			visitRegExpAST(ast.pattern, {
				onCharacterEnter(node) {
					if (/^\\(?:[1-9]|\d{2,})$/.test(node.raw)) {
						reportError(`${tokenPath}: Octal escape ${node.raw}.\n\n`
							+ `Octal escapes can be confused with backreferences, so please do not use them.\n`
							+ `To fix this, use a different escape method. `
							+ `Note that this could also be an invalid backreference, so be sure to carefully analyse the pattern.`);
					}
				}
			});
		});
	});

	it('- should not cause exponential backtracking', function () {
		// Also check every regex executed while highlighting the test snippets, not just the
		// ones statically reachable from the grammar.
		replaceRegExpProto(exec => {
			return function (input) {
				checkExponentialBacktracking('<Unknown>', this);
				return exec.call(this, input);
			};
		}, () => {
			forEachPattern(({ pattern, ast, tokenPath }) => {
				checkExponentialBacktracking(tokenPath, pattern, ast);
			});
		});
	});

	it('- should not cause polynomial backtracking', function () {
		replaceRegExpProto(exec => {
			return function (input) {
				checkPolynomialBacktracking('<Unknown>', this);
				return exec.call(this, input);
			};
		}, () => {
			forEachPattern(({ pattern, ast, tokenPath }) => {
				checkPolynomialBacktracking(tokenPath, pattern, ast);
			});
		});
	});

}
|
|
|
|
|
|
/**
 * Returns the first capturing group in the given pattern, or `undefined` if the
 * pattern has no capturing groups.
 *
 * @param {Pattern} pattern
 * @returns {CapturingGroup | undefined}
 */
function getFirstCapturingGroup(pattern) {
	/** @type {CapturingGroup | undefined} */
	let first = undefined;

	// `visitRegExpAST` has no way to abort a traversal, so we throw a sentinel to stop
	// as soon as the first group is found. Unlike a blanket `catch`, we rethrow anything
	// that is not our sentinel so genuine errors are not silently swallowed.
	const stop = new Error('stop');
	try {
		visitRegExpAST(pattern, {
			onCapturingGroupEnter(node) {
				first = node;
				throw stop;
			}
		});
	} catch (error) {
		if (error !== stop) {
			throw error;
		}
	}

	return first;
}
|
|
|
|
/**
 * Returns whether the given element will always be at the start of the whole match.
 *
 * This walks up the AST: the element is "first" if, on every level, everything before
 * it is zero-length and no enclosing quantifier can repeat it.
 *
 * @param {Element} element
 * @returns {boolean}
 */
function isFirstMatch(element) {
	const parent = element.parent;
	switch (parent.type) {
		case 'Alternative': {
			// all elements before this element have to be of zero length
			if (!parent.elements.slice(0, parent.elements.indexOf(element)).every(RAA.isZeroLength)) {
				return false;
			}
			const grandParent = parent.parent;
			if (grandParent.type === 'Pattern') {
				// reached the top of the pattern without finding preceding characters
				return true;
			} else {
				// the alternative belongs to a group/lookaround; recurse on its container
				return isFirstMatch(grandParent);
			}
		}

		case 'Quantifier':
			if (parent.max >= 2) {
				// a second iteration would be preceded by the first iteration's characters
				return false;
			} else {
				return isFirstMatch(parent);
			}

		default:
			// NOTE(review): the message prints `element.type`, but it is `parent.type`
			// that was unexpected here — consider reporting that instead.
			throw new Error(`Internal error: The given node should not be a '${element.type}'.`);
	}
}
|
|
|
|
/**
 * Returns whether the given node either is or is a child of what is effectively a Kleene star.
 *
 * "Effectively" means the maximum number of repetitions only has to be large, not
 * necessarily infinite.
 *
 * @param {import("regexpp/ast").Node} node
 * @returns {boolean}
 */
function underAStar(node) {
	// 10 is a heuristic cutoff: quantifiers with a large maximum behave like stars
	// as far as the backtracking analysis is concerned.
	return RAA.getEffectiveMaximumRepetition(node) > 10;
}
|
|
|
|
/**
 * Returns the first item of the given iterable, or `undefined` if it is empty.
 *
 * @param {Iterable<T>} iter
 * @returns {T | undefined}
 * @template T
 */
function firstOf(iter) {
	for (const item of iter) {
		return item;
	}
	return undefined;
}
|
|
|
|
/**
 * A set of all safe (non-exponentially backtracking) RegExp literals (string).
 *
 * Both the `RegExp` objects and their string forms are stored, so identical patterns
 * are only analysed once.
 *
 * @type {Set<string | RegExp>}
 */
const expoSafeRegexes = new Set();

/**
 * Options shared by all refa transformers below.
 *
 * @type {Transformers.CreationOptions}
 */
const options = {
	ignoreOrder: true,
	ignoreAmbiguity: true
};
// A combined refa transformer that simplifies regex ASTs before NFA construction
// (used by `toNFA` in `checkExponentialBacktracking`).
const transformer = combineTransformers([
	Transformers.inline(options),
	Transformers.removeDeadBranches(options),
	Transformers.unionCharacters(options),
	Transformers.moveUpEmpty(options),
	Transformers.nestedQuantifiers(options),
	Transformers.sortAssertions(options),
	Transformers.removeUnnecessaryAssertions(options),
	Transformers.applyAssertions(options),
]);
|
|
|
|
|
|
/**
 * Checks that the given pattern cannot cause exponential backtracking and fails the
 * current test if it can. Patterns that pass are cached in `expoSafeRegexes` (both the
 * `RegExp` object and its string form), so each unique pattern is only analysed once.
 *
 * @param {string} path
 * @param {RegExp} pattern
 * @param {LiteralAST} [ast]
 * @returns {void}
 */
function checkExponentialBacktracking(path, pattern, ast) {
	if (expoSafeRegexes.has(pattern)) {
		// we know that the pattern won't cause exp backtracking because we checked before
		return;
	}
	const patternStr = String(pattern);
	if (expoSafeRegexes.has(patternStr)) {
		// we know that the pattern won't cause exp backtracking because we checked before
		return;
	}

	if (!ast) {
		ast = parseRegex(pattern);
	}

	const parser = JS.Parser.fromAst(ast);
	/**
	 * Parses the given element and creates its NFA.
	 *
	 * @param {import("refa").JS.ParsableElement} element
	 * @returns {NFA}
	 */
	function toNFA(element) {
		const { expression, maxCharacter } = parser.parseElement(element, {
			maxBackreferenceWords: 1000,
			backreferences: 'disable'
		});

		return NFA.fromRegex(transform(transformer, expression), { maxCharacter }, { assertions: 'disable' });
	}

	/**
	 * Checks whether the alternatives of the given node are disjoint. If the alternatives are not disjoint
	 * and the given node is a descendant of an effective Kleene star, then an error will be thrown.
	 *
	 * @param {CapturingGroup | Group | LookaroundAssertion} node
	 * @returns {void}
	 */
	function checkDisjointAlternatives(node) {
		if (!underAStar(node) || node.alternatives.length < 2) {
			return;
		}

		const alternatives = node.alternatives;

		// `total` accumulates the union of all alternatives seen so far; each new
		// alternative must be disjoint with that union.
		const total = toNFA(alternatives[0]);
		total.withoutEmptyWord();
		for (let i = 1, l = alternatives.length; i < l; i++) {
			const a = alternatives[i];
			const current = toNFA(a);
			current.withoutEmptyWord();

			if (!total.isDisjointWith(current)) {
				assert.fail(`${path}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.`
					+ ` This will cause exponential backtracking.`
					+ `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.`
					+ ` The goal is that all of its alternatives are disjoint.`
					+ ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.`
					+ `\n\nExample: \`(?:[ab]|\\w|::)+\``
					+ `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.`
					+ ` In this example, the pattern can easily be fixed because the \`[ab]\` is a subset of the \`\\w\`, so it's enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.`
					+ `\nIn the real world, patterns can be a lot harder to fix.`
					+ ` If you are trying to make the tests pass for a pull request but can't fix the issue yourself, then make the pull request (or commit) anyway.`
					+ ` A maintainer will help you.`
					+ `\n\nFull pattern:\n${pattern}`);
			} else if (i !== l - 1) {
				total.union(current);
			}
		}
	}

	visitRegExpAST(ast.pattern, {
		onCapturingGroupLeave: checkDisjointAlternatives,
		onGroupLeave: checkDisjointAlternatives,
		onAssertionLeave(node) {
			if (node.kind === 'lookahead' || node.kind === 'lookbehind') {
				checkDisjointAlternatives(node);
			}
		},

		onQuantifierLeave(node) {
			if (node.max < 10) {
				return; // not a star
			}
			if (node.element.type !== 'CapturingGroup' && node.element.type !== 'Group') {
				return; // not a group
			}

			// The idea here is the following:
			//
			// We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be
			// the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w`
			// that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w`
			// is accepted by `A{m}`.
			// This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as
			// `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main
			// requirement for exponential backtracking.
			//
			// This is actually only a crude approximation for the real analysis that would have to be done. We
			// would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most
			// cases, the approximation is good enough.

			const nfa = toNFA(node.element);
			nfa.withoutEmptyWord();
			const twoStar = nfa.copy();
			twoStar.quantify(2, Infinity);

			if (!nfa.isDisjointWith(twoStar)) {
				const word = Words.pickMostReadableWord(firstOf(getIntersectionWordSets(nfa, twoStar)));
				const example = Words.fromUnicodeToString(word);
				assert.fail(`${path}: The quantifier \`${node.raw}\` is ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.`
					+ ` This will cause exponential backtracking.`
					+ `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.`
					+ ` The goal is modify E such that it is disjoint with repetitions of itself.`
					+ ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.`
					+ `\n\nExample 1: \`(?:\\w+|::)+\``
					+ `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.`
					+ ` In this example, the pattern can easily be fixed by changing \`\\w+\` to \`\\w\`.`
					+ `\nExample 2: \`(?:\\w|Foo)+\``
					+ `\nThe problem lies in \`\\w\` and \`Foo\` because the string "Foo" can be matched as either repeating \`\\w\` 3 times or by using the \`Foo\` alternative once.`
					+ ` In this example, the pattern can easily be fixed because the \`Foo\` alternative is redundant and can be removed.`
					+ `\nExample 3: \`(?:\\.\\w+(?:<.*?>)?)+\``
					+ `\nThe problem lies in \`<.*?>\`. The string ".a<>.a<>" can be matched as either \`\\. \\w < . . . . >\` or \`\\. \\w < > \\. \\w < >\`.`
					+ ` When it comes to exponential backtracking, it doesn't matter whether a quantifier is greedy or lazy.`
					+ ` This means that the lazy \`.*?\` can jump over \`>\`.`
					+ ` In this example, the pattern can easily be fixed because we just have to prevent \`.*?\` jumping over \`>\`.`
					+ ` This can be done by replacing \`<.*?>\` with \`<[^\\r\\n>]*>\`.`
					+ `\n\nIn the real world, patterns can be a lot harder to fix.`
					+ ` If you are trying to make this test pass for a pull request but can't fix the issue yourself, then make the pull request (or commit) anyway, a maintainer will help you.`
					+ `\n\nFull pattern:\n${pattern}`);
			}
		},
	});

	expoSafeRegexes.add(pattern);
	expoSafeRegexes.add(patternStr);
}
|
|
|
|
/**
 * A set of all safe (non-polynomially backtracking) RegExp literals (string).
 *
 * @type {Set<string | RegExp>}
 */
const polySafeRegexes = new Set();
/**
 * Checks that the given pattern cannot cause polynomial backtracking and fails the
 * current test if it can. Safe patterns are cached in `polySafeRegexes`.
 *
 * @param {string} path
 * @param {RegExp} pattern
 * @param {LiteralAST} [ast]
 * @returns {void}
 */
function checkPolynomialBacktracking(path, pattern, ast) {
	if (polySafeRegexes.has(pattern)) {
		// we know that the pattern won't cause poly backtracking because we checked before
		return;
	}
	const patternStr = String(pattern);
	if (polySafeRegexes.has(patternStr)) {
		// we know that the pattern won't cause poly backtracking because we checked before
		return;
	}

	if (!ast) {
		ast = parseRegex(pattern);
	}

	// 'Move' reports are disabled here; NOTE(review): presumably they produce too many
	// false positives for grammars — confirm before re-enabling.
	const result = scslre.analyse(ast, { maxReports: 1, reportTypes: { 'Move': false } });
	if (result.reports.length > 0) {
		const report = result.reports[0];

		// a human-readable excerpt of the pattern plus a marker line pointing at the
		// offending quantifier(s)
		let rangeOffset;
		let rangeStr;
		let rangeHighlight;

		// The `+ 1` offsets below appear to account for the leading '/' of the
		// stringified regex literal — NOTE(review): confirm against scslre's index base.
		switch (report.type) {
			case 'Trade': {
				const start = Math.min(report.startQuant.start, report.endQuant.start);
				const end = Math.max(report.startQuant.end, report.endQuant.end);
				rangeOffset = start + 1;
				rangeStr = patternStr.substring(start + 1, end + 1);
				rangeHighlight = highlight([
					{ ...report.startQuant, label: 'start' },
					{ ...report.endQuant, label: 'end' }
				], -start);
				break;
			}
			case 'Self': {
				rangeOffset = report.parentQuant.start + 1;
				rangeStr = patternStr.substring(report.parentQuant.start + 1, report.parentQuant.end + 1);
				rangeHighlight = highlight([{ ...report.quant, label: 'self' }], -report.parentQuant.start);
				break;
			}
			case 'Move': {
				// unreachable in practice since 'Move' reports are disabled above;
				// kept defensively
				rangeOffset = 1;
				rangeStr = patternStr.substring(1, report.quant.end + 1);
				rangeHighlight = highlight([report.quant]);
				break;
			}
			default:
				throw new Error('Invalid report type. This should never happen.');
		}

		const attackChar = `/${report.character.literal.source}/${report.character.literal.flags}`;
		const fixed = report.fix();

		assert.fail(
			`${path}: ${report.exponential ? 'Exponential' : 'Polynomial'} backtracking. `
			+ `By repeating any character that matches ${attackChar}, an attack string can be created.`
			+ `\n`
			+ `\n${indent(rangeStr)}`
			+ `\n${indent(rangeHighlight)}`
			+ `\n`
			+ `\nFull pattern:`
			+ `\n${patternStr}`
			+ `\n${indent(rangeHighlight, ' '.repeat(rangeOffset))}`
			+ `\n`
			+ `\n` + (fixed ? `Fixed:\n/${fixed.source}/${fixed.flags}` : `Fix not available.`)
		);
	}

	polySafeRegexes.add(pattern);
	polySafeRegexes.add(patternStr);
}
|
|
|
|
/**
 * Renders ASCII marker lines (`^~~[label]`) for the given ranges. Overlapping
 * highlights are pushed onto subsequent lines, and the lines are joined with `'\n'`.
 *
 * @param {Highlight[]} highlights
 * @param {number} [offset] Added to every `start`/`end` before rendering.
 * @returns {string}
 *
 * @typedef Highlight
 * @property {number} start
 * @property {number} end
 * @property {string} [label]
 */
function highlight(highlights, offset = 0) {
	// copy before sorting so the caller's array is not mutated
	let remaining = [...highlights].sort((a, b) => a.start - b.start);

	const lines = [];
	while (remaining.length > 0) {
		const overflow = [];
		let line = '';
		for (const h of remaining) {
			const start = h.start + offset;
			const end = h.end + offset;
			if (start < line.length) {
				// overlaps a highlight already rendered on this line; defer to the next line
				overflow.push(h);
			} else {
				line += ' '.repeat(start - line.length);
				line += '^';
				line += '~'.repeat(end - start - 1);
				if (h.label) {
					line += '[' + h.label + ']';
				}
			}
		}
		lines.push(line);
		remaining = overflow;
	}

	return lines.join('\n');
}
|
|
|
|
/**
 * Prefixes every non-empty line of the given string with the given amount of
 * indentation. Line endings are normalized to `'\n'`.
 *
 * @param {string} str
 * @param {string} amount
 * @returns {string}
 */
function indent(str, amount = '    ') {
	const lines = str.split(/\r?\n/);
	return lines.map(line => (line === '' ? '' : amount + line)).join('\n');
}
|
|
|
|
/**
 * Temporarily replaces `RegExp.prototype.exec` (and `test`, which is routed through the
 * replacement `exec`) with the function produced by `execSupplier` while `fn` runs.
 *
 * The original methods are always restored afterwards, even if `fn` throws.
 *
 * @param {(exec: RegExp["exec"]) => RegExp["exec"]} execSupplier
 * @param {() => void} fn
 */
function replaceRegExpProto(execSupplier, fn) {
	const oldExec = RegExp.prototype.exec;
	const oldTest = RegExp.prototype.test;
	const newExec = execSupplier(oldExec);

	RegExp.prototype.exec = newExec;
	RegExp.prototype.test = function (input) {
		return newExec.call(this, input) !== null;
	};

	try {
		fn();
	} finally {
		// `try`/`finally` both guarantees restoration and rethrows whatever `fn` threw —
		// including falsy values that the previous `if (error)` check silently swallowed.
		RegExp.prototype.exec = oldExec;
		RegExp.prototype.test = oldTest;
	}
}
|
|
|
|
/**
 * Normalizes the given value into an array: arrays are returned as-is, `null` and
 * `undefined` become an empty array, and anything else is wrapped in a one-element array.
 *
 * @param {undefined | null | T | T[]} value
 * @returns {T[]}
 * @template T
 */
function toArray(value) {
	if (value == null) {
		return [];
	}
	return Array.isArray(value) ? value : [value];
}
|