Improve code quality and write tests

This commit is contained in:
Shivam Mathur
2019-09-20 08:11:20 +05:30
parent db44db4b97
commit 43178a7254
3597 changed files with 255478 additions and 785554 deletions

83
node_modules/regexp-tree/dist/optimizer/index.js generated vendored Normal file
View File

@ -0,0 +1,83 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
var clone = require('../utils/clone');
var parser = require('../parser');
var transform = require('../transform');
var optimizationTransforms = require('./transforms');
module.exports = {
/**
* Optimizer transforms a regular expression into an optimized version,
* replacing some sub-expressions with their idiomatic patterns.
*
* @param string | RegExp | AST - a regexp to optimize.
*
* @return TransformResult - an optimized regexp.
*
* Example:
*
* /[a-zA-Z_0-9][a-zA-Z_0-9]*\e{1,}/
*
* Optimized to:
*
* /\w+e+/
*/
optimize: function optimize(regexp) {
var transformsWhitelist = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : [];
var transformToApply = transformsWhitelist.length > 0 ? transformsWhitelist : Object.keys(optimizationTransforms);
var ast = regexp;
if (regexp instanceof RegExp) {
regexp = '' + regexp;
}
if (typeof regexp === 'string') {
ast = parser.parse(regexp);
}
var result = new transform.TransformResult(ast);
var prevResultString = void 0;
do {
// Get a copy of the current state here so
// we can compare it with the state at the
// end of the loop.
prevResultString = result.toString();
ast = clone(result.getAST());
transformToApply.forEach(function (transformName) {
if (!optimizationTransforms.hasOwnProperty(transformName)) {
throw new Error('Unknown optimization-transform: ' + transformName + '. ' + 'Available transforms are: ' + Object.keys(optimizationTransforms).join(', '));
}
var transformer = optimizationTransforms[transformName];
// Don't override result just yet since we
// might want to rollback the transform
var newResult = transform.transform(ast, transformer);
if (newResult.toString() !== result.toString()) {
if (newResult.toString().length <= result.toString().length) {
result = newResult;
} else {
// Result has changed but is not shorter:
// restore ast to its previous state.
ast = clone(result.getAST());
}
}
});
// Keep running the optimizer until it stops
// making any change to the regexp.
} while (result.toString() !== prevResultString);
return result;
}
};

View File

@ -0,0 +1,110 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
var UPPER_A_CP = 'A'.codePointAt(0);
var UPPER_Z_CP = 'Z'.codePointAt(0);
/**
* Transforms case-insensitive regexp to lowercase
*
* /AaBbÏ/i -> /aabbï/i
*/
module.exports = {
_AZClassRanges: null,
_hasUFlag: false,
init: function init(ast) {
this._AZClassRanges = new Set();
this._hasUFlag = ast.flags.includes('u');
},
shouldRun: function shouldRun(ast) {
return ast.flags.includes('i');
},
Char: function Char(path) {
var node = path.node,
parent = path.parent;
if (isNaN(node.codePoint)) {
return;
}
// Engine support for case-insensitive matching without the u flag
// for characters above \u1000 does not seem reliable.
if (!this._hasUFlag && node.codePoint >= 0x1000) {
return;
}
if (parent.type === 'ClassRange') {
// The only class ranges we handle must be inside A-Z.
// After the `from` char is processed, the isAZClassRange test
// will be false, so we use a Set to keep track of parents and
// process the `to` char.
if (!this._AZClassRanges.has(parent) && !isAZClassRange(parent)) {
return;
}
this._AZClassRanges.add(parent);
}
var lower = node.symbol.toLowerCase();
if (lower !== node.symbol) {
node.value = displaySymbolAsValue(lower, node);
node.symbol = lower;
node.codePoint = lower.codePointAt(0);
}
}
};
/**
 * Tells whether both ends of a class range are uppercase ASCII letters.
 * @param {Object} classRange - ClassRange node
 * @returns {boolean}
 */
function isAZClassRange(classRange) {
  return [classRange.from, classRange.to].every(function (endpoint) {
    return endpoint.codePoint >= UPPER_A_CP && endpoint.codePoint <= UPPER_Z_CP;
  });
}
/**
 * Renders `symbol` using the same escape notation as the node it replaces.
 * @param {string} symbol - replacement character
 * @param {Object} node - Char node whose `kind` (and, for unicode escapes,
 *   `isSurrogatePair` / `value`) selects the notation
 * @returns {string}
 */
function displaySymbolAsValue(symbol, node) {
  var codePoint = symbol.codePointAt(0);
  switch (node.kind) {
    case 'decimal':
      return '\\' + codePoint;
    case 'oct':
      return '\\0' + codePoint.toString(8);
    case 'hex':
      return '\\x' + codePoint.toString(16);
    case 'unicode':
      if (node.isSurrogatePair) {
        // Two \uXXXX escapes, each padded to four hex digits.
        var pair = getSurrogatePairFromCodePoint(codePoint);
        var leadPadding = '0'.repeat(4 - pair.lead.length);
        var trailPadding = '0'.repeat(4 - pair.trail.length);
        return '\\u' + leadPadding + pair.lead + '\\u' + trailPadding + pair.trail;
      }
      if (node.value.includes('{')) {
        return '\\u{' + codePoint.toString(16) + '}';
      }
      var hexDigits = codePoint.toString(16);
      return '\\u' + '0'.repeat(4 - hexDigits.length) + hexDigits;
    default:
      // simple
      return symbol;
  }
}
/**
 * Splits a supplementary-plane code point into its UTF-16 surrogates,
 * following The Unicode Standard 3.0, Section 3.7
 * (https://www.unicode.org/versions/Unicode3.0.0/ch03.pdf).
 * @param {number} codePoint - Between 0x10000 and 0x10ffff
 * @returns {{lead: string, trail: string}} both surrogates as lowercase hex
 */
function getSurrogatePairFromCodePoint(codePoint) {
  var offset = codePoint - 0x10000;
  var highSurrogate = 0xd800 + Math.floor(offset / 0x400);
  var lowSurrogate = 0xdc00 + offset % 0x400;
  return {
    lead: highSurrogate.toString(16),
    trail: lowSurrogate.toString(16)
  };
}

View File

@ -0,0 +1,339 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
* A regexp-tree plugin to merge class ranges.
*
* [a-ec] -> [a-e]
* [a-ec-e] -> [a-e]
* [\w\da-f] -> [\w]
* [abcdef] -> [a-f]
*/
module.exports = {
// True when the regexp has both the `i` and `u` flags (set in init);
// forwarded to fitsInMetas() for every expression.
_hasIUFlags: false,
/**
* Records whether the regexp carries both the `i` and `u` flags.
* @param {Object} ast - RegExp AST node; `ast.flags` is searched with `includes`
*/
init: function init(ast) {
this._hasIUFlags = ast.flags.includes('i') && ast.flags.includes('u');
},
/**
* Sorts the expressions of a character class in place, then drops or
* merges the ones that are covered by a meta char, by a neighbouring
* class range, or by a run of preceding chars.
* @param {Object} path - node path whose `node.expressions` is mutated
*/
CharacterClass: function CharacterClass(path) {
var node = path.node;
var expressions = node.expressions;
var metas = [];
// Extract metas
expressions.forEach(function (expression) {
if (isMeta(expression)) {
metas.push(expression.value);
}
});
// Sorting makes mergeable expressions adjacent, which is what the
// neighbour checks below (i - 1 and i + 1) rely on.
expressions.sort(sortCharClass);
for (var i = 0; i < expressions.length; i++) {
var expression = expressions[i];
// The two combinesWith* helpers widen the neighbouring range as a side
// effect when they return true, so the current expression is redundant.
if (fitsInMetas(expression, metas, this._hasIUFlags) || combinesWithPrecedingClassRange(expression, expressions[i - 1]) || combinesWithFollowingClassRange(expression, expressions[i + 1])) {
expressions.splice(i, 1);
// Step back so the element that slid into slot i is examined next.
i--;
} else {
// On a merge the helper has already replaced the first char of the run
// with a ClassRange; remove the nbMergedChars chars after it (up to and
// including the current one). With 0 merged chars both lines are no-ops.
var nbMergedChars = charCombinesWithPrecedingChars(expression, i, expressions);
expressions.splice(i - nbMergedChars + 1, nbMergedChars);
i -= nbMergedChars;
}
}
}
};
/**
 * Comparator ordering the expressions of a char class as follows:
 * - meta chars, ordered alphabetically by value
 * - chars (except `control` kind) and class ranges, ordered by code point
 *   (`from` char is used for class ranges)
 * - if ambiguous, class range comes before char
 * - if ambiguous between two class ranges, orders by `to` char
 * - control chars, ordered alphabetically by value
 *
 * Two metas (or two control chars) carrying the same value compare as
 * equal, so the comparator gives the same answer whichever way round
 * its arguments are passed.
 *
 * @param {Object} a - Left Char or ClassRange node
 * @param {Object} b - Right Char or ClassRange node
 * @returns {number} negative, zero or positive, as Array.prototype.sort expects
 */
function sortCharClass(a, b) {
  var aValue = getSortValue(a);
  var bValue = getSortValue(b);
  if (aValue === bValue) {
    // We want ClassRange before Char
    // [bb-d] -> [b-db]
    if (a.type === 'ClassRange' && b.type !== 'ClassRange') {
      return -1;
    }
    if (b.type === 'ClassRange' && a.type !== 'ClassRange') {
      return 1;
    }
    if (a.type === 'ClassRange' && b.type === 'ClassRange') {
      return getSortValue(a.to) - getSortValue(b.to);
    }
    if (isMeta(a) && isMeta(b) || isControl(a) && isControl(b)) {
      // Identical values must yield 0: answering 1 for both (a, b) and
      // (b, a) makes the comparator inconsistent.
      if (a.value === b.value) {
        return 0;
      }
      return a.value < b.value ? -1 : 1;
    }
  }
  return aValue - bValue;
}
/**
 * Numeric sort key of a char class expression: `Infinity` for control
 * chars, -1 for meta chars without a code point, otherwise the code
 * point of the char (or of a class range's `from` char).
 * @param {Object} expression - Char or ClassRange node
 * @returns {number}
 */
function getSortValue(expression) {
  if (expression.type !== 'Char') {
    // ClassRange
    return expression.from.codePoint;
  }
  switch (expression.kind) {
    case 'control':
      return Infinity;
    case 'meta':
      if (isNaN(expression.codePoint)) {
        return -1;
      }
      break;
  }
  return expression.codePoint;
}
/**
 * Checks if a node is a meta char. With a truthy second argument the
 * node's value must equal it; otherwise any of \d\w\s\D\W\S qualifies.
 * @param {Object} expression - Char or ClassRange node
 * @param {?string} value
 * @returns {boolean}
 */
function isMeta(expression) {
  var expected = arguments[1];
  if (expression.type !== 'Char' || expression.kind !== 'meta') {
    return false;
  }
  if (expected) {
    return expression.value === expected;
  }
  return /^\\[dws]$/i.test(expression.value);
}
/**
 * Checks if a node is a Char of the `control` kind.
 * @param {Object} expression - Char or ClassRange node
 * @returns {boolean}
 */
function isControl(expression) {
  if (expression.type !== 'Char') {
    return false;
  }
  return expression.kind === 'control';
}
/**
 * Tells whether the expression is covered by at least one of the metas.
 * @param {Object} expression - Char or ClassRange node
 * @param {string[]} metas - Array of meta chars, e.g. ["\\w", "\\s"]
 * @param {boolean} hasIUFlags
 * @returns {boolean}
 */
function fitsInMetas(expression, metas, hasIUFlags) {
  return metas.some(function (meta) {
    return fitsInMeta(expression, meta, hasIUFlags);
  });
}
/**
 * Tells whether everything matched by `expression` is also matched by
 * the meta char `meta`, i.e. whether the expression is redundant next
 * to that meta inside a character class.
 *
 * A class range only fits when every code point it spans fits; checking
 * the two endpoints alone would accept e.g. [A-z] for \w although the
 * range also contains "[", "\", "]", "^" and "`".
 *
 * @param {Object} expression - Char or ClassRange node
 * @param {string} meta - e.g. "\\w"
 * @param {boolean} hasIUFlags
 * @returns {boolean}
 */
function fitsInMeta(expression, meta, hasIUFlags) {
  if (expression.type === 'ClassRange') {
    var from = expression.from;
    var to = expression.to;
    if (!fitsInMeta(from, meta, hasIUFlags) || !fitsInMeta(to, meta, hasIUFlags)) {
      return false;
    }
    // Endpoints fit: now make sure nothing in between falls outside.
    for (var codePoint = from.codePoint + 1; codePoint < to.codePoint; codePoint++) {
      var symbol = String.fromCodePoint(codePoint);
      var inner = {
        type: 'Char',
        kind: 'simple',
        value: symbol,
        symbol: symbol,
        codePoint: codePoint
      };
      if (!fitsInMeta(inner, meta, hasIUFlags)) {
        return false;
      }
    }
    return true;
  }
  // Special cases:
  // \S contains \w and \d
  if (meta === '\\S' && (isMeta(expression, '\\w') || isMeta(expression, '\\d'))) {
    return true;
  }
  // \D contains \W and \s
  if (meta === '\\D' && (isMeta(expression, '\\W') || isMeta(expression, '\\s'))) {
    return true;
  }
  // \w contains \d
  if (meta === '\\w' && isMeta(expression, '\\d')) {
    return true;
  }
  // \W contains \s
  if (meta === '\\W' && isMeta(expression, '\\s')) {
    return true;
  }
  // Past this point only chars with a real code point can be compared.
  if (expression.type !== 'Char' || isNaN(expression.codePoint)) {
    return false;
  }
  if (meta === '\\s') {
    return fitsInMetaS(expression);
  }
  if (meta === '\\S') {
    return !fitsInMetaS(expression);
  }
  if (meta === '\\d') {
    return fitsInMetaD(expression);
  }
  if (meta === '\\D') {
    return !fitsInMetaD(expression);
  }
  if (meta === '\\w') {
    return fitsInMetaW(expression, hasIUFlags);
  }
  if (meta === '\\W') {
    return !fitsInMetaW(expression, hasIUFlags);
  }
  return false;
}
/**
 * Tells whether the char's code point is one of the whitespace code
 * points listed below.
 * @param {Object} expression - Char node with codePoint
 * @returns {boolean}
 */
function fitsInMetaS(expression) {
  var codePoint = expression.codePoint;
  // part of Zs
  if (codePoint >= 0x2000 && codePoint <= 0x200a) {
    return true;
  }
  switch (codePoint) {
    case 0x0009: // \t
    case 0x000a: // \n
    case 0x000b: // \v
    case 0x000c: // \f
    case 0x000d: // \r
    case 0x0020: // space
    case 0x00a0: // nbsp
    case 0x1680: // part of Zs
    case 0x2028: // line separator
    case 0x2029: // paragraph separator
    case 0x202f: // part of Zs
    case 0x205f: // part of Zs
    case 0x3000: // part of Zs
    case 0xfeff: // zwnbsp
      return true;
    default:
      return false;
  }
}
/**
 * Tells whether the char is an ASCII digit (0-9).
 * @param {Object} expression - Char node with codePoint
 * @returns {boolean}
 */
function fitsInMetaD(expression) {
  var codePoint = expression.codePoint;
  if (codePoint >= 0x30) {
    return codePoint <= 0x39;
  }
  return false;
}
/**
 * Tells whether the char is a word char: a digit, an ASCII letter, a
 * literal "_", or - when `hasIUFlags` is set - U+017F or U+212A.
 * @param {Object} expression - Char node with codePoint
 * @param {boolean} hasIUFlags
 * @returns {boolean}
 */
function fitsInMetaW(expression, hasIUFlags) {
  var codePoint = expression.codePoint;
  if (fitsInMetaD(expression)) {
    return true;
  }
  // A-Z
  if (codePoint >= 0x41 && codePoint <= 0x5a) {
    return true;
  }
  // a-z
  if (codePoint >= 0x61 && codePoint <= 0x7a) {
    return true;
  }
  if (expression.value === '_') {
    return true;
  }
  return hasIUFlags && (codePoint === 0x017f || codePoint === 0x212a);
}
/**
* Tries to absorb `expression` into the class range that precedes it.
* Returns true when the expression is already covered by the range or
* when the range was widened to cover it; in both cases the caller can
* drop the expression. `classRange` is mutated when it is widened.
* @param {Object} expression - Char or ClassRange node
* @param {Object} classRange - Char or ClassRange node
* @returns {boolean}
*/
function combinesWithPrecedingClassRange(expression, classRange) {
// `classRange` is whatever sits before the expression; it may be missing
// or not a ClassRange at all, in which case nothing can be combined.
if (classRange && classRange.type === 'ClassRange') {
if (fitsInClassRange(expression, classRange)) {
// [a-gc] -> [a-g]
// [a-gc-e] -> [a-g]
return true;
} else if (
// We only want \w chars or char codes to keep readability
isMetaWCharOrCode(expression) && classRange.to.codePoint === expression.codePoint - 1) {
// [a-de] -> [a-e]
classRange.to = expression;
return true;
} else if (expression.type === 'ClassRange' && expression.from.codePoint <= classRange.to.codePoint + 1 && expression.to.codePoint >= classRange.from.codePoint - 1) {
// The two ranges overlap or touch: keep the lower `from` and the
// higher `to` in `classRange`.
// [a-db-f] -> [a-f]
// [b-fa-d] -> [a-f]
// [a-cd-f] -> [a-f]
if (expression.from.codePoint < classRange.from.codePoint) {
classRange.from = expression.from;
}
if (expression.to.codePoint > classRange.to.codePoint) {
classRange.to = expression.to;
}
return true;
}
}
return false;
}
/**
 * Tries to absorb a char into the class range that follows it by moving
 * the range's `from` one step down: [ab-e] -> [a-e]. Since elements are
 * ordered, this is the only case to handle. `classRange` is mutated
 * when the merge happens.
 * @param {Object} expression - Char or ClassRange node
 * @param {Object} classRange - Char or ClassRange node
 * @returns {boolean} true when the expression was absorbed
 */
function combinesWithFollowingClassRange(expression, classRange) {
  if (!classRange || classRange.type !== 'ClassRange') {
    return false;
  }
  // We only want \w chars or char codes to keep readability
  if (!isMetaWCharOrCode(expression)) {
    return false;
  }
  if (classRange.from.codePoint !== expression.codePoint + 1) {
    return false;
  }
  classRange.from = expression;
  return true;
}
/**
 * Tells whether a char, or both ends of a class range, lie within
 * `classRange`. Chars without a code point never fit.
 * @param {Object} expression - Char or ClassRange node
 * @param {Object} classRange - ClassRange node
 * @returns {boolean}
 */
function fitsInClassRange(expression, classRange) {
  if (expression.type === 'ClassRange') {
    return fitsInClassRange(expression.from, classRange) && fitsInClassRange(expression.to, classRange);
  }
  if (expression.type === 'Char' && isNaN(expression.codePoint)) {
    return false;
  }
  var lowest = classRange.from.codePoint;
  var highest = classRange.to.codePoint;
  return expression.codePoint >= lowest && expression.codePoint <= highest;
}
/**
* Looks backwards from `index` for a run of chars with consecutive code
* points ending at `expression`. When more than one preceding char
* belongs to the run (three chars or more in total), the first char of
* the run is replaced in `expressions` by a ClassRange spanning the run;
* removing the remaining chars is left to the caller.
* @param {Object} expression - Char or ClassRange node
* @param {Number} index
* @param {Object[]} expressions - expressions in CharClass
* @returns {number} - Number of characters combined with expression
*/
function charCombinesWithPrecedingChars(expression, index, expressions) {
// We only want \w chars or char codes to keep readability
if (!isMetaWCharOrCode(expression)) {
return 0;
}
var nbMergedChars = 0;
// Walk left while each predecessor is exactly one code point lower.
// `index` ends up pointing at the first char of the run.
while (index > 0) {
var currentExpression = expressions[index];
var precedingExpresion = expressions[index - 1];
if (isMetaWCharOrCode(precedingExpresion) && precedingExpresion.codePoint === currentExpression.codePoint - 1) {
nbMergedChars++;
index--;
} else {
break;
}
}
// A single predecessor is not merged: only runs with more than one
// preceding char are turned into a range.
if (nbMergedChars > 1) {
expressions[index] = {
type: 'ClassRange',
from: expressions[index],
to: expression
};
return nbMergedChars;
}
return 0;
}
/**
 * Tells whether a node is a Char with a code point that is either a word
 * char (without i+u extensions) or written as a char code (unicode, hex,
 * oct or decimal escape). A falsy `expression` is returned as is.
 * @param {?Object} expression - Char or ClassRange node
 * @returns {boolean}
 */
function isMetaWCharOrCode(expression) {
  if (!expression) {
    return expression;
  }
  if (expression.type !== 'Char' || isNaN(expression.codePoint)) {
    return false;
  }
  return fitsInMetaW(expression, false) || ['unicode', 'hex', 'oct', 'decimal'].indexOf(expression.kind) !== -1;
}

View File

@ -0,0 +1,30 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
* A regexp-tree plugin to simplify character classes
* spanning only one or two chars.
*
* [a-a] -> [a]
* [a-b] -> [ab]
*/
module.exports = {
ClassRange: function ClassRange(path) {
var node = path.node;
if (node.from.codePoint === node.to.codePoint) {
path.replace(node.from);
} else if (node.from.codePoint === node.to.codePoint - 1) {
path.getParent().insertChildAt(node.to, path.index + 1);
path.replace(node.from);
}
}
};

View File

@ -0,0 +1,33 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
* A regexp-tree plugin to remove duplicates from character classes.
*/
module.exports = {
CharacterClass: function CharacterClass(path) {
var node = path.node;
var sources = {};
for (var i = 0; i < node.expressions.length; i++) {
var childPath = path.getChild(i);
var source = childPath.jsonEncode();
if (sources.hasOwnProperty(source)) {
childPath.remove();
// Since we remove the current node.
// TODO: make it simpler for users with a method.
i--;
}
sources[source] = true;
}
}
};

View File

@ -0,0 +1,211 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
* A regexp-tree plugin to replace standard character classes with
* their meta symbols equivalents.
*/
module.exports = {
_hasIFlag: false,
_hasUFlag: false,
init: function init(ast) {
this._hasIFlag = ast.flags.includes('i');
this._hasUFlag = ast.flags.includes('u');
},
CharacterClass: function CharacterClass(path) {
// [0-9] -> \d
rewriteNumberRanges(path);
// [a-zA-Z_0-9] -> \w
rewriteWordRanges(path, this._hasIFlag, this._hasUFlag);
// [ \t\r\n\f] -> \s
rewriteWhitespaceRanges(path);
}
};
/**
 * Rewrites number ranges: [0-9] -> \d
 *
 * Every expression of the class that is a full 0-9 range is replaced
 * by a `\d` meta char.
 */
function rewriteNumberRanges(path) {
  var expressions = path.node.expressions;
  for (var i = 0, count = expressions.length; i < count; i++) {
    if (!isFullNumberRange(expressions[i])) {
      continue;
    }
    path.getChild(i).replace({
      type: 'Char',
      value: '\\d',
      kind: 'meta'
    });
  }
}
/**
* Rewrites word ranges: [a-zA-Z_0-9] -> \w
* Thus, the ranges may go in any order, and other symbols/ranges
* are kept untouched, e.g. [a-z_\dA-Z$] -> [\w$]
*
* The digits must already be present as `\d`. With the `i` flag a single
* letter range (a-z or A-Z) is enough; with both `i` and `u` the chars
* U+017F and U+212A must be present as well.
*
* @param {Object} path - node path of the CharacterClass
* @param {boolean} hasIFlag - whether the regexp has the `i` flag
* @param {boolean} hasUFlag - whether the regexp has the `u` flag
*/
function rewriteWordRanges(path, hasIFlag, hasUFlag) {
var node = path.node;
// Child paths of the building blocks of \w found in the class; when a
// block occurs several times the last occurrence wins.
var numberPath = null;
var lowerCasePath = null;
var upperCasePath = null;
var underscorePath = null;
var u017fPath = null;
var u212aPath = null;
node.expressions.forEach(function (expression, i) {
// \d
if (isMetaChar(expression, '\\d')) {
numberPath = path.getChild(i);
}
// a-z
else if (isLowerCaseRange(expression)) {
lowerCasePath = path.getChild(i);
}
// A-Z
else if (isUpperCaseRange(expression)) {
upperCasePath = path.getChild(i);
}
// _
else if (isUnderscore(expression)) {
underscorePath = path.getChild(i);
} else if (hasIFlag && hasUFlag && isU017fPath(expression)) {
u017fPath = path.getChild(i);
} else if (hasIFlag && hasUFlag && isU212aPath(expression)) {
u212aPath = path.getChild(i);
}
});
// If we found the whole pattern, replace it.
// Required: \d, the letters (both ranges, or either one under `i`),
// the underscore, and - only under `i` + `u` - U+017F and U+212A.
if (numberPath && (lowerCasePath && upperCasePath || hasIFlag && (lowerCasePath || upperCasePath)) && underscorePath && (!hasUFlag || !hasIFlag || u017fPath && u212aPath)) {
// Put \w in place of \d.
numberPath.replace({
type: 'Char',
value: '\\w',
kind: 'meta'
});
// Other paths are removed.
if (lowerCasePath) {
lowerCasePath.remove();
}
if (upperCasePath) {
upperCasePath.remove();
}
underscorePath.remove();
if (u017fPath) {
u017fPath.remove();
}
if (u212aPath) {
u212aPath.remove();
}
}
}
/**
 * Rewrites whitespace ranges: [ \t\r\n\f] -> \s.
 *
 * The rewrite happens when the class contains a space, \t, \n and \r;
 * \f is optional. The \n node is turned into \s and the other matched
 * nodes are removed.
 *
 * NOTE(review): in JavaScript \s matches more characters than the five
 * listed here, so this rewrite widens the class - confirm that this is
 * the intended trade-off.
 *
 * @param {Object} path - node path of the CharacterClass
 */
function rewriteWhitespaceRanges(path) {
  var node = path.node;
  var spacePath = null;
  var tPath = null;
  var nPath = null;
  var rPath = null;
  var fPath = null;
  node.expressions.forEach(function (expression, i) {
    // Space
    if (isChar(expression, ' ')) {
      spacePath = path.getChild(i);
    }
    // \t
    else if (isMetaChar(expression, '\\t')) {
      tPath = path.getChild(i);
    }
    // \n
    else if (isMetaChar(expression, '\\n')) {
      nPath = path.getChild(i);
    }
    // \r
    else if (isMetaChar(expression, '\\r')) {
      rPath = path.getChild(i);
    }
    // \f
    else if (isMetaChar(expression, '\\f')) {
      fPath = path.getChild(i);
    }
  });
  // If we found the whole pattern, replace it.
  // Make \f optional.
  if (spacePath && tPath && nPath && rPath) {
    // Put \s in place of \n.
    var metaNode = nPath.node;
    metaNode.value = '\\s';
    // The node no longer stands for the single character "\n": drop the
    // leftover symbol and code point, like the \d and \w nodes created by
    // the sibling rewrites, so that code comparing code points does not
    // mistake \s for U+000A.
    metaNode.symbol = undefined;
    metaNode.codePoint = NaN;
    // Other paths are removed.
    spacePath.remove();
    tPath.remove();
    rPath.remove();
    if (fPath) {
      fPath.remove();
    }
  }
}
/**
 * Tells whether the node is the class range written as 0-9.
 */
function isFullNumberRange(node) {
  if (node.type !== 'ClassRange') {
    return false;
  }
  return node.from.value === '0' && node.to.value === '9';
}
/**
 * Tells whether the node is a Char with the given value and kind.
 * The kind is read from the optional third argument and defaults to
 * 'simple'.
 */
function isChar(node, value) {
  var kind = arguments[2];
  if (kind === undefined) {
    kind = 'simple';
  }
  if (node.type !== 'Char') {
    return false;
  }
  return node.value === value && node.kind === kind;
}
/**
 * Tells whether the node is a Char of kind 'meta' with the given value.
 */
function isMetaChar(node, value) {
  var expectedKind = 'meta';
  return isChar(node, value, expectedKind);
}
/**
 * Tells whether the node is the class range written as a-z.
 */
function isLowerCaseRange(node) {
  if (node.type !== 'ClassRange') {
    return false;
  }
  return node.from.value === 'a' && node.to.value === 'z';
}
/**
 * Tells whether the node is the class range written as A-Z.
 */
function isUpperCaseRange(node) {
  if (node.type !== 'ClassRange') {
    return false;
  }
  return node.from.value === 'A' && node.to.value === 'Z';
}
/**
 * Tells whether the node is a simple "_" char.
 */
function isUnderscore(node) {
  if (node.type !== 'Char' || node.kind !== 'simple') {
    return false;
  }
  return node.value === '_';
}
/**
 * Tells whether the node is U+017F written as a unicode escape.
 */
function isU017fPath(node) {
  if (node.type !== 'Char' || node.kind !== 'unicode') {
    return false;
  }
  return node.codePoint === 0x017f;
}
/**
 * Tells whether the node is U+212A written as a unicode escape.
 */
function isU212aPath(node) {
  if (node.type !== 'Char' || node.kind !== 'unicode') {
    return false;
  }
  return node.codePoint === 0x212a;
}

View File

@ -0,0 +1,71 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
* A regexp-tree plugin to replace single char character classes with
* just that character.
*
* [\d] -> \d, [^\w] -> \W
*/
module.exports = {
CharacterClass: function CharacterClass(path) {
var node = path.node;
if (node.expressions.length !== 1 || !isAppropriateChar(node.expressions[0])) {
return;
}
var _node$expressions$ = node.expressions[0],
value = _node$expressions$.value,
kind = _node$expressions$.kind,
escaped = _node$expressions$.escaped;
if (node.negative) {
// For negative can extract only meta chars like [^\w] -> \W
// cannot do for [^a] -> a (wrong).
if (!isMeta(value)) {
return;
}
value = getInverseMeta(value);
}
path.replace({
type: 'Char',
value: value,
kind: kind,
escaped: escaped || shouldEscape(value)
});
}
};
/**
 * Tells whether a class member may be extracted from its class.
 * [\b] (backspace) is excluded since \b outside a class has different
 * semantics (word boundary).
 */
function isAppropriateChar(node) {
  if (node.type !== 'Char') {
    return false;
  }
  return node.value !== '\\b';
}
/**
 * Tells whether the value is one of \d \w \s \D \W \S.
 */
function isMeta(value) {
  var metaPattern = /^\\[dwsDWS]$/;
  return metaPattern.test(value);
}
/**
 * Flips the case of a meta char value: a value containing a lowercase
 * d, w or s is uppercased, anything else is lowercased.
 */
function getInverseMeta(value) {
  if (/[dws]/.test(value)) {
    return value.toUpperCase();
  }
  return value.toLowerCase();
}
/**
 * Tells whether a value taken out of a character class contains a char
 * that has to be escaped outside of it.
 *
 * Note: \{ and \} are always preserved to avoid `a[{]2[}]` turning
 * into `a{2}`.
 */
function shouldEscape(value) {
  var specialOutsideClass = /[*[()+?$./{}|]/;
  return specialOutsideClass.exec(value) !== null;
}

View File

@ -0,0 +1,84 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
var UPPER_A_CP = 'A'.codePointAt(0);
var UPPER_Z_CP = 'Z'.codePointAt(0);
var LOWER_A_CP = 'a'.codePointAt(0);
var LOWER_Z_CP = 'z'.codePointAt(0);
var DIGIT_0_CP = '0'.codePointAt(0);
var DIGIT_9_CP = '9'.codePointAt(0);
/**
* A regexp-tree plugin to transform coded chars into simple chars.
*
* \u0061 -> a
*/
module.exports = {
Char: function Char(path) {
var node = path.node,
parent = path.parent;
if (isNaN(node.codePoint) || node.kind === 'simple') {
return;
}
if (parent.type === 'ClassRange') {
if (!isSimpleRange(parent)) {
return;
}
}
if (!isPrintableASCIIChar(node.codePoint)) {
return;
}
var symbol = String.fromCodePoint(node.codePoint);
var newChar = {
type: 'Char',
kind: 'simple',
value: symbol,
symbol: symbol,
codePoint: node.codePoint
};
if (needsEscape(symbol, parent.type)) {
newChar.escaped = true;
}
path.replace(newChar);
}
};
/**
 * Checks if both ends of a range lie together in 0-9, in A-Z or in a-z
 * @param classRange
 * @returns {boolean}
 */
function isSimpleRange(classRange) {
  var low = classRange.from.codePoint;
  var high = classRange.to.codePoint;
  // True when both ends of the range are within [min, max].
  function bothWithin(min, max) {
    return low >= min && low <= max && high >= min && high <= max;
  }
  return bothWithin(DIGIT_0_CP, DIGIT_9_CP) || bothWithin(UPPER_A_CP, UPPER_Z_CP) || bothWithin(LOWER_A_CP, LOWER_Z_CP);
}
/**
 * Checks if a code point lies between space (0x20) and tilde (0x7e),
 * i.e. is a printable ASCII char (DEL char excluded)
 * @param codePoint
 * @returns {boolean}
 */
function isPrintableASCIIChar(codePoint) {
  if (codePoint < 0x20) {
    return false;
  }
  return codePoint <= 0x7e;
}
/**
 * Tells whether a plain symbol must be escaped in the given context.
 * Inside a character class or class range the set is ] \ ^ -, elsewhere
 * it is the set of regexp syntax characters listed below.
 * @param {string} symbol
 * @param {string} parentType - type of the parent node
 * @returns {boolean}
 */
function needsEscape(symbol, parentType) {
  var insideClass = parentType === 'ClassRange' || parentType === 'CharacterClass';
  var special = insideClass ? /[\]\\^-]/ : /[*[()+?^$./\\|{}]/;
  return special.test(symbol);
}

View File

@ -0,0 +1,143 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
* A regexp-tree plugin to remove unnecessary escape.
*
* \e -> e
*
* [\(] -> [(]
*/
module.exports = {
_hasXFlag: false,
init: function init(ast) {
this._hasXFlag = ast.flags.includes('x');
},
Char: function Char(path) {
var node = path.node;
if (!node.escaped) {
return;
}
if (shouldUnescape(path, this._hasXFlag)) {
delete node.escaped;
}
}
};
/**
 * Tells whether the escape of the char at `path` can be removed, using
 * the rules of its context (inside or outside a character class).
 * @param {Object} path - node path of the Char
 * @param {boolean} hasXFlag
 * @returns {boolean}
 */
function shouldUnescape(path, hasXFlag) {
  var value = path.node.value;
  var parent = path.parent;
  // In char class (, etc are allowed.
  var insideClass = parent.type === 'CharacterClass' || parent.type === 'ClassRange';
  var keepsEscape = insideClass
    ? preservesInCharClass(value, path.index, parent)
    : preservesEscape(value, path.index, parent, hasXFlag);
  return !keepsEscape;
}
/**
 * Tells whether an escaped char must stay escaped inside a character
 * class: \], \\, \^, \-
 *
 * `parent` is either the CharacterClass holding the char or, for a range
 * endpoint, a ClassRange. A ClassRange has `from`/`to` but no
 * `expressions`, and the position of the range inside its class is not
 * available here, so for range endpoints the escape of "^" and "-" is
 * always kept rather than guessed at.
 *
 * @param {string} value - the char
 * @param {?number} index - position of the char among the parent's expressions
 * @param {Object} parent - CharacterClass or ClassRange node
 * @returns {boolean} true when the escape must be kept
 */
function preservesInCharClass(value, index, parent) {
  if (parent.type === 'ClassRange') {
    // [\^-a] must not become [^-a], and reading parent.expressions would
    // throw on a node that has none.
    if (value === '^' || value === '-') {
      return true;
    }
  } else {
    if (value === '^') {
      // Avoid [\^a] turning into [^a]
      return index === 0 && !parent.negative;
    }
    if (value === '-') {
      // Avoid [a\-z] turning into [a-z]
      return index !== 0 && index !== parent.expressions.length - 1;
    }
  }
  return (/[\]\\]/.test(value)
  );
}
/**
 * Tells whether an escaped char outside a character class must stay
 * escaped. Curly braces are decided by their neighbours, space and "#"
 * are kept under the `x` flag, and regexp syntax characters are always
 * kept.
 * @param {string} value - the char
 * @param {number} index - position of the char among the parent's expressions
 * @param {Object} parent - parent node
 * @param {boolean} hasXFlag
 * @returns {boolean}
 */
function preservesEscape(value, index, parent, hasXFlag) {
  switch (value) {
    case '{':
      return preservesOpeningCurlyBraceEscape(index, parent);
    case '}':
      return preservesClosingCurlyBraceEscape(index, parent);
  }
  if (hasXFlag && /[ #]/.test(value)) {
    return true;
  }
  return (/[*[()+?^$./\\|]/).test(value);
}
/**
 * Counts the consecutive unescaped simple digit chars found among
 * `parent.expressions`, starting at `startIndex` and walking right, or
 * left when `rtl` is truthy.
 * @param {number} startIndex
 * @param {Object} parent - node with an `expressions` array
 * @param {boolean} [rtl] - walk towards index 0 instead of the end
 * @returns {number} how many digit chars were walked over
 */
function consumeNumbers(startIndex, parent, rtl) {
  var step = rtl ? -1 : 1;
  var cursor = startIndex;

  // The sibling at `position`, or false once the walk leaves the list
  // on the side it is heading to.
  function siblingAt(position) {
    return (rtl ? position >= 0 : position < parent.expressions.length) && parent.expressions[position];
  }

  function isUnescapedDigit(sibling) {
    return sibling && sibling.type === 'Char' && sibling.kind === 'simple' && !sibling.escaped && /\d/.test(sibling.value);
  }

  while (isUnescapedDigit(siblingAt(cursor))) {
    cursor += step;
  }
  return Math.abs(startIndex - cursor);
}
/**
 * Tells whether `node` is an unescaped simple Char with the given value.
 * A falsy `node` is returned as is.
 * @param {?Object} node
 * @param {string} value
 * @returns {boolean}
 */
function isSimpleChar(node, value) {
  if (!node) {
    return node;
  }
  if (node.type !== 'Char' || node.kind !== 'simple') {
    return false;
  }
  return !node.escaped && node.value === value;
}
/**
 * Tells whether an escaped "{" at `index` must stay escaped because,
 * unescaped, it would open a quantifier: it is followed by digits and
 * then "}", or by digits, ",", optional digits and "}".
 * @param {number} index - position of the "{" among the parent's expressions
 * @param {Object} parent - node with an `expressions` array
 * @returns {boolean}
 */
function preservesOpeningCurlyBraceEscape(index, parent) {
  var siblings = parent.expressions;
  var digitCount = consumeNumbers(index + 1, parent);
  if (!digitCount) {
    return false;
  }
  var cursor = index + digitCount + 1;
  var next = cursor < siblings.length && siblings[cursor];
  // Avoid \{3} turning into {3}
  if (isSimpleChar(next, '}')) {
    return true;
  }
  if (!isSimpleChar(next, ',')) {
    return false;
  }
  digitCount = consumeNumbers(cursor + 1, parent);
  cursor = cursor + digitCount + 1;
  next = cursor < siblings.length && siblings[cursor];
  // Avoid \{3,} turning into {3,}
  return isSimpleChar(next, '}');
}
/**
 * Tells whether an escaped "}" at `index` must stay escaped because,
 * unescaped, it would close a quantifier: it is preceded by "{" and
 * digits, or by "{", digits, "," and optional digits.
 *
 * Always returns a boolean, and never looks up a sibling at a negative
 * position while walking leftwards.
 *
 * @param {number} index - position of the "}" among the parent's expressions
 * @param {Object} parent - node with an `expressions` array
 * @returns {boolean}
 */
function preservesClosingCurlyBraceEscape(index, parent) {
  var nbPrecedingNumbers = consumeNumbers(index - 1, parent, true);
  var i = index - nbPrecedingNumbers - 1;
  var previousSiblingNode = i >= 0 && parent.expressions[i];
  // Avoid {3\} turning into {3}
  if (nbPrecedingNumbers && isSimpleChar(previousSiblingNode, '{')) {
    return true;
  }
  if (isSimpleChar(previousSiblingNode, ',')) {
    nbPrecedingNumbers = consumeNumbers(i - 1, parent, true);
    i = i - nbPrecedingNumbers - 1;
    // The walk goes right-to-left, so the bound to respect is the start
    // of the list (same check as a few lines above).
    previousSiblingNode = i >= 0 && parent.expressions[i];
    // Avoid {3,\} turning into {3,}
    return Boolean(nbPrecedingNumbers && isSimpleChar(previousSiblingNode, '{'));
  }
  return false;
}

View File

@ -0,0 +1,27 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
* A regexp-tree plugin to transform surrogate pairs into single unicode code point
*
* \ud83d\ude80 -> \u{1f680}
*/
module.exports = {
shouldRun: function shouldRun(ast) {
return ast.flags.includes('u');
},
Char: function Char(path) {
var node = path.node;
if (node.kind !== 'unicode' || !node.isSurrogatePair || isNaN(node.codePoint)) {
return;
}
node.value = '\\u{' + node.codePoint.toString(16) + '}';
delete node.isSurrogatePair;
}
};

View File

@ -0,0 +1,195 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
 * Returns a fresh array holding the elements of `arr`: arrays are copied
 * index by index, anything else goes through Array.from.
 */
function _toConsumableArray(arr) {
  if (!Array.isArray(arr)) {
    return Array.from(arr);
  }
  var copy = Array(arr.length);
  for (var position = 0; position < arr.length; position++) {
    copy[position] = arr[position];
  }
  return copy;
}
var NodePath = require('../../traverse/node-path');
var _require = require('../../transform/utils'),
increaseQuantifierByOne = _require.increaseQuantifierByOne;
/**
* A regexp-tree plugin to combine repeating patterns.
*
* /^abcabcabc/ -> /^(?:abc){3}/
* /^(?:abc){2}abc/ -> /^(?:abc){3}/
* /^abc(?:abc){2}/ -> /^(?:abc){3}/
*/
module.exports = {
/**
* Walks the children of an Alternative from left to right and tries the
* three combinations below on each of them. Each helper returns the
* index at which the walk should continue: the same index when nothing
* changed, a lower one when children were removed.
* @param {Object} path - node path of the Alternative
*/
Alternative: function Alternative(path) {
var node = path.node;
// We can skip the first child
var index = 1;
while (index < node.expressions.length) {
var child = path.getChild(index);
// Math.max(1, ...) keeps the walk off index 0, which has nothing
// before it to combine with.
index = Math.max(1, combineRepeatingPatternLeft(path, child, index));
// A combination may have shortened the list; stop if we ran past it.
if (index >= node.expressions.length) {
break;
}
child = path.getChild(index);
index = Math.max(1, combineWithPreviousRepetition(path, child, index));
if (index >= node.expressions.length) {
break;
}
child = path.getChild(index);
index = Math.max(1, combineRepetitionWithPrevious(path, child, index));
index++;
}
}
};
// abcabc -> (?:abc){2}
/**
* Looks for a pattern ending at `child` that is immediately preceded by
* an identical copy of itself. Pattern lengths are tried from one
* expression upwards. On a match both copies are collapsed into a single
* Repetition with quantifier {2}.
* @param {Object} alternative - node path of the Alternative
* @param {Object} child - node path of the expression at `index`
* @param {number} index - position of `child` in the Alternative
* @returns {number} the index where the pattern started when a
*   combination happened, `index` otherwise
*/
function combineRepeatingPatternLeft(alternative, child, index) {
var node = alternative.node;
// Two copies must fit in the first `index + 1` expressions, which bounds
// the pattern length (i + 1) that can be tried.
var nbPossibleLengths = Math.ceil(index / 2);
var i = 0;
while (i < nbPossibleLengths) {
// First expression of the left copy for a pattern of length i + 1.
var startIndex = index - 2 * i - 1;
var right = void 0,
left = void 0;
if (i === 0) {
// Single-expression pattern: compare the two neighbours directly.
right = child;
left = alternative.getChild(startIndex);
} else {
// Longer pattern: wrap each copy in a temporary Alternative so the
// two can be compared by source.
right = NodePath.getForNode({
type: 'Alternative',
expressions: [].concat(_toConsumableArray(node.expressions.slice(index - i, index)), [child.node])
});
left = NodePath.getForNode({
type: 'Alternative',
expressions: [].concat(_toConsumableArray(node.expressions.slice(startIndex, index - i)))
});
}
if (right.hasEqualSource(left)) {
// Remove everything from both copies except `child` itself: each
// removal shifts the rest left, so the same index is removed 2i + 1 times.
for (var j = 0; j < 2 * i + 1; j++) {
alternative.getChild(startIndex).remove();
}
// A multi-expression pattern needs a non-capturing group around it.
child.replace({
type: 'Repetition',
expression: i === 0 ? right.node : {
type: 'Group',
capturing: false,
expression: right.node
},
quantifier: {
type: 'Quantifier',
kind: 'Range',
from: 2,
to: 2,
greedy: true
}
});
return startIndex;
}
i++;
}
return index;
}
// (?:abc){2}abc -> (?:abc){3}
/**
* Looks for a greedy Repetition before `child` whose repeated expression
* equals everything between that repetition and `child` (inclusive). On
* a match those expressions are removed and the repetition's quantifier
* is increased by one.
* @param {Object} alternative - node path of the Alternative
* @param {Object} child - node path of the expression at `index`
* @param {number} index - position of `child` in the Alternative
* @returns {number} the index of the repetition when a combination
*   happened, `index` otherwise
*/
function combineWithPreviousRepetition(alternative, child, index) {
var node = alternative.node;
var i = 0;
while (i < index) {
var previousChild = alternative.getChild(i);
if (previousChild.node.type === 'Repetition' && previousChild.node.quantifier.greedy) {
var left = previousChild.getChild();
var right = void 0;
// Compare the content of a non-capturing group, not the group itself.
if (left.node.type === 'Group' && !left.node.capturing) {
left = left.getChild();
}
if (i + 1 === index) {
// `child` directly follows the repetition.
right = child;
if (right.node.type === 'Group' && !right.node.capturing) {
right = right.getChild();
}
} else {
// Several expressions follow the repetition: compare them as a whole.
right = NodePath.getForNode({
type: 'Alternative',
expressions: [].concat(_toConsumableArray(node.expressions.slice(i + 1, index + 1)))
});
}
if (left.hasEqualSource(right)) {
// Remove the index - i expressions after the repetition; each removal
// shifts the rest left, so the same index is removed every time.
for (var j = i; j < index; j++) {
alternative.getChild(i + 1).remove();
}
increaseQuantifierByOne(previousChild.node.quantifier);
return i;
}
}
i++;
}
return index;
}
// abc(?:abc){2} -> (?:abc){3}
/**
* When `child` is a greedy Repetition, checks whether the expressions
* right before it equal its repeated expression. On a match those
* expressions are removed and the repetition's quantifier is increased
* by one.
* @param {Object} alternative - node path of the Alternative
* @param {Object} child - node path of the expression at `index`
* @param {number} index - position of `child` in the Alternative
* @returns {number} the new index of the repetition when a combination
*   happened, `index` otherwise
*/
function combineRepetitionWithPrevious(alternative, child, index) {
var node = alternative.node;
if (child.node.type === 'Repetition' && child.node.quantifier.greedy) {
var right = child.getChild();
var left = void 0;
// Compare the content of a non-capturing group, not the group itself.
if (right.node.type === 'Group' && !right.node.capturing) {
right = right.getChild();
}
// Number of preceding expressions the repeated pattern corresponds to.
var rightLength = void 0;
if (right.node.type === 'Alternative') {
rightLength = right.node.expressions.length;
left = NodePath.getForNode({
type: 'Alternative',
expressions: [].concat(_toConsumableArray(node.expressions.slice(index - rightLength, index)))
});
} else {
rightLength = 1;
left = alternative.getChild(index - 1);
if (left.node.type === 'Group' && !left.node.capturing) {
left = left.getChild();
}
}
if (left.hasEqualSource(right)) {
// Remove the rightLength expressions before the repetition; each
// removal shifts the rest left, so the same index is removed every time.
for (var j = index - rightLength; j < index; j++) {
alternative.getChild(index - rightLength).remove();
}
increaseQuantifierByOne(child.node.quantifier);
return index - rightLength;
}
}
return index;
}

View File

@ -0,0 +1,44 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
var NodePath = require('../../traverse/node-path');
var _require = require('../../transform/utils'),
disjunctionToList = _require.disjunctionToList,
listToDisjunction = _require.listToDisjunction;
/**
* Removes duplicates from a disjunction sequence:
*
* /(ab|bc|ab)+(xy|xy)+/ -> /(ab|bc)+(xy)+/
*/
module.exports = {
Disjunction: function Disjunction(path) {
var node = path.node;
// Make unique nodes.
var uniqueNodesMap = {};
var parts = disjunctionToList(node).filter(function (part) {
var encoded = part ? NodePath.getForNode(part).jsonEncode() : 'null';
// Already recorded this part, filter out.
if (uniqueNodesMap.hasOwnProperty(encoded)) {
return false;
}
uniqueNodesMap[encoded] = part;
return true;
});
// Replace with the optimized disjunction.
path.replace(listToDisjunction(parts));
}
};

View File

@ -0,0 +1,92 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
* A regexp-tree plugin to replace a disjunction of single characters with a character class
*
* a|b|c -> [abc]
* [12]|3|4 -> [1234]
* (a|b|c) -> ([abc])
* (?:a|b|c) -> [abc]
*/
module.exports = {
Disjunction: function Disjunction(path) {
var node = path.node,
parent = path.parent;
if (!handlers[parent.type]) {
return;
}
var charset = new Map();
if (!shouldProcess(node, charset) || !charset.size) {
return;
}
var characterClass = {
type: 'CharacterClass',
expressions: Array.from(charset.keys()).sort().map(function (key) {
return charset.get(key);
})
};
handlers[parent.type](path.getParent(), characterClass);
}
};
/**
 * Per-parent-type strategies for putting the generated character class
 * where the disjunction used to be. Each receives the parent's path and
 * the character class node.
 */
var handlers = {
  // Pattern root: the class becomes the whole body.
  RegExp: function RegExp(path, characterClass) {
    path.node.body = characterClass;
  },
  // Group: a capturing group keeps its parentheses around the class,
  // a non-capturing one is replaced by the class itself.
  Group: function Group(path, characterClass) {
    if (!path.node.capturing) {
      path.replace(characterClass);
      return;
    }
    path.node.expression = characterClass;
  }
};
/**
 * Decides whether an expression consists solely of single characters that
 * can be folded into one character class, collecting them on the way.
 *
 * Accepted: Char nodes, positive CharacterClass nodes whose members are
 * all accepted, and Disjunctions of accepted parts. Rejected:
 *
 *   - empty disjunction parts;
 *   - the `.` meta char, which would turn into a literal dot inside a
 *     class (assumes the parser marks it as Char with kind 'meta' and
 *     symbol/value '.' -- confirm against the AST spec);
 *   - negated classes: /[^a]|b/ is not /[ab]/ (assumes negation is
 *     exposed as a truthy `negative` flag -- confirm against the AST spec);
 *   - every other node type (class ranges, groups, ...).
 *
 * @param expression - AST node to inspect (may be null for an empty part).
 * @param charset - Map filled with char value -> Char node. It may be
 *   partially filled even when the result is false.
 *
 * @return true when the whole expression can become a character class.
 */
function shouldProcess(expression, charset) {
  if (!expression) {
    // Abort on empty disjunction part
    return false;
  }
  var type = expression.type;
  if (type === 'Disjunction') {
    return shouldProcess(expression.left, charset) && shouldProcess(expression.right, charset);
  }
  if (type === 'Char') {
    var isAnyCharDot = expression.kind === 'meta' && (expression.symbol === '.' || expression.value === '.');
    if (isAnyCharDot) {
      return false;
    }
    charset.set(expression.value, expression);
    return true;
  }
  if (type === 'CharacterClass' && !expression.negative) {
    return expression.expressions.every(function (member) {
      return shouldProcess(member, charset);
    });
  }
  return false;
}

View File

@ -0,0 +1,56 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
// Registry of the optimizer transforms, keyed by transform name.
//
// The optimizer applies the keys of this object in declaration order when
// no whitelist is passed (it iterates `Object.keys` of this module), so
// reordering the entries changes the default optimization pipeline. The
// same keys are listed in the error raised for an unknown transform name.
module.exports = {
// \ud83d\ude80 -> \u{1f680}
'charSurrogatePairToSingleUnicode': require('./char-surrogate-pair-to-single-unicode-transform'),
// \u0061 -> a
'charCodeToSimpleChar': require('./char-code-to-simple-char-transform'),
// /Aa/i -> /aa/i
'charCaseInsensitiveLowerCaseTransform': require('./char-case-insensitive-lowercase-transform'),
// [\d\d] -> [\d]
'charClassRemoveDuplicates': require('./char-class-remove-duplicates-transform'),
// a{1,2}a{2,3} -> a{3,5}
'quantifiersMerge': require('./quantifiers-merge-transform'),
// a{1,} -> a+, a{3,3} -> a{3}, a{1} -> a
'quantifierRangeToSymbol': require('./quantifier-range-to-symbol-transform'),
// [a-a] -> [a], [a-b] -> [ab]
'charClassClassrangesToChars': require('./char-class-classranges-to-chars-transform'),
// [a-de-f] -> [a-f]
'charClassClassrangesMerge': require('./char-class-classranges-merge-transform'),
// [0-9] -> [\d]
'charClassToMeta': require('./char-class-to-meta-transform'),
// [\d] -> \d, [^\w] -> \W
'charClassToSingleChar': require('./char-class-to-single-char-transform'),
// \e -> e
'charEscapeUnescape': require('./char-escape-unescape-transform'),
// (ab|ab) -> (ab)
'disjunctionRemoveDuplicates': require('./disjunction-remove-duplicates-transform'),
// (a|b|c) -> ([abc]), (?:a|b|c) -> [abc]
'groupSingleCharsToCharClass': require('./group-single-chars-to-char-class'),
// (?:)a -> a
'removeEmptyGroup': require('./remove-empty-group-transform'),
// (?:a) -> a
'ungroup': require('./ungroup-transform'),
// abcabcabc -> (?:abc){3}
'combineRepeatingPatterns': require('./combine-repeating-patterns-transform')
};

View File

@ -0,0 +1,74 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
* A regexp-tree plugin to replace different range-based quantifiers
* with their symbol equivalents.
*
* a{0,} -> a*
* a{1,} -> a+
* a{1} -> a
*
* NOTE: the following is automatically handled in the generator:
*
* a{3,3} -> a{3}
*/
module.exports = {
Quantifier: function Quantifier(path) {
var node = path.node;
if (node.kind !== 'Range') {
return;
}
// a{0,} -> a*
rewriteOpenZero(path);
// a{1,} -> a+
rewriteOpenOne(path);
// a{1} -> a
rewriteExactOne(path);
}
};
/**
 * Rewrites an open range that starts at zero to the `*` symbol:
 *
 *   a{0,} -> a*
 *
 * The upper bound is tested for presence, not truthiness, so a range
 * with an explicit upper bound of zero (`to: 0`) is left untouched
 * instead of being widened to `*`.
 *
 * @param path - path whose `node` is a Range quantifier; mutated in place.
 */
function rewriteOpenZero(path) {
  var node = path.node;
  var hasUpperBound = node.to !== undefined && node.to !== null;
  if (node.from !== 0 || hasUpperBound) {
    return;
  }
  node.kind = '*';
  delete node.from;
}
/**
 * Rewrites an open range that starts at one to the `+` symbol:
 *
 *   a{1,} -> a+
 *
 * @param path - path whose `node` is a Range quantifier; mutated in place.
 */
function rewriteOpenOne(path) {
  var quantifier = path.node;
  var isOpenFromOne = quantifier.from === 1 && !quantifier.to;
  if (isOpenFromOne) {
    quantifier.kind = '+';
    delete quantifier.from;
  }
}
/**
 * Drops a quantifier that repeats exactly once by replacing the parent
 * of the quantifier with that parent's own expression:
 *
 *   a{1} -> a
 *
 * @param path - path of the Range quantifier; `path.parentPath` is the
 *   path that gets replaced.
 */
function rewriteExactOne(path) {
  var quantifier = path.node;
  if (quantifier.from === 1 && quantifier.to === 1) {
    var owner = path.parentPath;
    owner.replace(owner.node.expression);
  }
}

View File

@ -0,0 +1,113 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
var _require = require('../../transform/utils'),
increaseQuantifierByOne = _require.increaseQuantifierByOne;
/**
* A regexp-tree plugin to merge quantifiers
*
* a+a+ -> a{2,}
* a{2}a{3} -> a{5}
* a{1,2}a{2,3} -> a{3,5}
*/
module.exports = {
Repetition: function Repetition(path) {
var node = path.node,
parent = path.parent;
if (parent.type !== 'Alternative' || !path.index) {
return;
}
var previousSibling = path.getPreviousSibling();
if (!previousSibling) {
return;
}
if (previousSibling.node.type === 'Repetition') {
if (!previousSibling.getChild().hasEqualSource(path.getChild())) {
return;
}
var _extractFromTo = extractFromTo(previousSibling.node.quantifier),
previousSiblingFrom = _extractFromTo.from,
previousSiblingTo = _extractFromTo.to;
var _extractFromTo2 = extractFromTo(node.quantifier),
nodeFrom = _extractFromTo2.from,
nodeTo = _extractFromTo2.to;
// It's does not seem reliable to merge quantifiers with different greediness
// when none of both is a greedy open range
if (previousSibling.node.quantifier.greedy !== node.quantifier.greedy && !isGreedyOpenRange(previousSibling.node.quantifier) && !isGreedyOpenRange(node.quantifier)) {
return;
}
// a*a* -> a*
// a*a+ -> a+
// a+a+ -> a{2,}
// a{2}a{4} -> a{6}
// a{1,2}a{2,3} -> a{3,5}
// a{1,}a{2,} -> a{3,}
// a+a{2,} -> a{3,}
// a??a{2,} -> a{2,}
// a*?a{2,} -> a{2,}
// a+?a{2,} -> a{3,}
node.quantifier.kind = 'Range';
node.quantifier.from = previousSiblingFrom + nodeFrom;
if (previousSiblingTo && nodeTo) {
node.quantifier.to = previousSiblingTo + nodeTo;
} else {
delete node.quantifier.to;
}
if (isGreedyOpenRange(previousSibling.node.quantifier) || isGreedyOpenRange(node.quantifier)) {
node.quantifier.greedy = true;
}
previousSibling.remove();
} else {
if (!previousSibling.hasEqualSource(path.getChild())) {
return;
}
increaseQuantifierByOne(node.quantifier);
previousSibling.remove();
}
}
};
function isGreedyOpenRange(quantifier) {
return quantifier.greedy && (quantifier.kind === '+' || quantifier.kind === '*' || quantifier.kind === 'Range' && !quantifier.to);
}
function extractFromTo(quantifier) {
var from = void 0,
to = void 0;
if (quantifier.kind === '*') {
from = 0;
} else if (quantifier.kind === '+') {
from = 1;
} else if (quantifier.kind === '?') {
from = 0;
to = 1;
} else {
from = quantifier.from;
if (quantifier.to) {
to = quantifier.to;
}
}
return { from: from, to: to };
}

View File

@ -0,0 +1,34 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
* A regexp-tree plugin to remove non-capturing empty groups.
*
* /(?:)a/ -> /a/
* /a|(?:)/ -> /a|/
*/
module.exports = {
Group: function Group(path) {
var node = path.node,
parent = path.parent;
var childPath = path.getChild();
if (node.capturing || childPath) {
return;
}
if (parent.type === 'Repetition') {
path.getParent().replace(node);
} else if (parent.type !== 'RegExp') {
path.remove();
}
}
};

View File

@ -0,0 +1,55 @@
/**
* The MIT License (MIT)
* Copyright (c) 2017-present Dmitry Soshnikov <dmitry.soshnikov@gmail.com>
*/
'use strict';
/**
* A regexp-tree plugin to remove unnecessary groups.
*
* /(?:a)/ -> /a/
*/
/**
 * Transpiler helper behind array spread: returns a fresh array holding
 * the elements of `arr`. Arrays are copied slot by slot (holes come out
 * as explicit undefined entries); anything else goes through Array.from.
 */
function _toConsumableArray(arr) {
  if (!Array.isArray(arr)) {
    return Array.from(arr);
  }
  var copy = Array(arr.length);
  var position = 0;
  while (position < arr.length) {
    copy[position] = arr[position];
    position++;
  }
  return copy;
}
module.exports = {
Group: function Group(path) {
var node = path.node,
parent = path.parent;
var childPath = path.getChild();
if (node.capturing || !childPath) {
return;
}
// Don't optimize /a(?:b|c)/ to /ab|c/
// but /(?:b|c)/ to /b|c/ is ok
if (childPath.node.type === 'Disjunction' && parent.type !== 'RegExp') {
return;
}
// Don't optimize /(?:ab)+/ to /ab+/
// but /(?:a)+/ to /a+/ is ok
// and /(?:[a-d])+/ to /[a-d]+/ is ok too
if (parent.type === 'Repetition' && childPath.node.type !== 'Char' && childPath.node.type !== 'CharacterClass') {
return;
}
if (childPath.node.type === 'Alternative') {
var parentPath = path.getParent();
if (parentPath.node.type === 'Alternative') {
// /abc(?:def)ghi/ When (?:def) is ungrouped its content must be merged with parent alternative
parentPath.replace({
type: 'Alternative',
expressions: [].concat(_toConsumableArray(parent.expressions.slice(0, path.index)), _toConsumableArray(childPath.node.expressions), _toConsumableArray(parent.expressions.slice(path.index + 1)))
});
}
} else {
path.replace(childPath.node);
}
}
};