123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282 |
- var util = require('./util');
- var types = require('./types');
- var sets = require('./sets');
- var positions = require('./positions');
- module.exports = function(regexpStr) {
- var i = 0, l, c,
- start = { type: types.ROOT, stack: []},
- // Keep track of last clause/group and stack.
- lastGroup = start,
- last = start.stack,
- groupStack = [];
- var repeatErr = function(i) {
- util.error(regexpStr, 'Nothing to repeat at column ' + (i - 1));
- };
- // Decode a few escaped characters.
- var str = util.strToChars(regexpStr);
- l = str.length;
- // Iterate through each character in string.
- while (i < l) {
- c = str[i++];
- switch (c) {
- // Handle escaped characters, inclues a few sets.
- case '\\':
- c = str[i++];
- switch (c) {
- case 'b':
- last.push(positions.wordBoundary());
- break;
- case 'B':
- last.push(positions.nonWordBoundary());
- break;
- case 'w':
- last.push(sets.words());
- break;
- case 'W':
- last.push(sets.notWords());
- break;
- case 'd':
- last.push(sets.ints());
- break;
- case 'D':
- last.push(sets.notInts());
- break;
- case 's':
- last.push(sets.whitespace());
- break;
- case 'S':
- last.push(sets.notWhitespace());
- break;
- default:
- // Check if c is integer.
- // In which case it's a reference.
- if (/\d/.test(c)) {
- last.push({ type: types.REFERENCE, value: parseInt(c, 10) });
- // Escaped character.
- } else {
- last.push({ type: types.CHAR, value: c.charCodeAt(0) });
- }
- }
- break;
- // Positionals.
- case '^':
- last.push(positions.begin());
- break;
- case '$':
- last.push(positions.end());
- break;
- // Handle custom sets.
- case '[':
- // Check if this class is 'anti' i.e. [^abc].
- var not;
- if (str[i] === '^') {
- not = true;
- i++;
- } else {
- not = false;
- }
- // Get all the characters in class.
- var classTokens = util.tokenizeClass(str.slice(i), regexpStr);
- // Increase index by length of class.
- i += classTokens[1];
- last.push({
- type: types.SET,
- set: classTokens[0],
- not: not,
- });
- break;
- // Class of any character except \n.
- case '.':
- last.push(sets.anyChar());
- break;
- // Push group onto stack.
- case '(':
- // Create group.
- var group = {
- type: types.GROUP,
- stack: [],
- remember: true,
- };
- c = str[i];
- // If if this is a special kind of group.
- if (c === '?') {
- c = str[i + 1];
- i += 2;
- // Match if followed by.
- if (c === '=') {
- group.followedBy = true;
- // Match if not followed by.
- } else if (c === '!') {
- group.notFollowedBy = true;
- } else if (c !== ':') {
- util.error(regexpStr,
- 'Invalid group, character \'' + c +
- '\' after \'?\' at column ' + (i - 1));
- }
- group.remember = false;
- }
- // Insert subgroup into current group stack.
- last.push(group);
- // Remember the current group for when the group closes.
- groupStack.push(lastGroup);
- // Make this new group the current group.
- lastGroup = group;
- last = group.stack;
- break;
- // Pop group out of stack.
- case ')':
- if (groupStack.length === 0) {
- util.error(regexpStr, 'Unmatched ) at column ' + (i - 1));
- }
- lastGroup = groupStack.pop();
- // Check if this group has a PIPE.
- // To get back the correct last stack.
- last = lastGroup.options ?
- lastGroup.options[lastGroup.options.length - 1] : lastGroup.stack;
- break;
- // Use pipe character to give more choices.
- case '|':
- // Create array where options are if this is the first PIPE
- // in this clause.
- if (!lastGroup.options) {
- lastGroup.options = [lastGroup.stack];
- delete lastGroup.stack;
- }
- // Create a new stack and add to options for rest of clause.
- var stack = [];
- lastGroup.options.push(stack);
- last = stack;
- break;
- // Repetition.
- // For every repetition, remove last element from last stack
- // then insert back a RANGE object.
- // This design is chosen because there could be more than
- // one repetition symbols in a regex i.e. `a?+{2,3}`.
- case '{':
- var rs = /^(\d+)(,(\d+)?)?\}/.exec(str.slice(i)), min, max;
- if (rs !== null) {
- if (last.length === 0) {
- repeatErr(i);
- }
- min = parseInt(rs[1], 10);
- max = rs[2] ? rs[3] ? parseInt(rs[3], 10) : Infinity : min;
- i += rs[0].length;
- last.push({
- type: types.REPETITION,
- min: min,
- max: max,
- value: last.pop(),
- });
- } else {
- last.push({
- type: types.CHAR,
- value: 123,
- });
- }
- break;
- case '?':
- if (last.length === 0) {
- repeatErr(i);
- }
- last.push({
- type: types.REPETITION,
- min: 0,
- max: 1,
- value: last.pop(),
- });
- break;
- case '+':
- if (last.length === 0) {
- repeatErr(i);
- }
- last.push({
- type: types.REPETITION,
- min: 1,
- max: Infinity,
- value: last.pop(),
- });
- break;
- case '*':
- if (last.length === 0) {
- repeatErr(i);
- }
- last.push({
- type: types.REPETITION,
- min: 0,
- max: Infinity,
- value: last.pop(),
- });
- break;
- // Default is a character that is not `\[](){}?+*^$`.
- default:
- last.push({
- type: types.CHAR,
- value: c.charCodeAt(0),
- });
- }
- }
- // Check if any groups have not been closed.
- if (groupStack.length !== 0) {
- util.error(regexpStr, 'Unterminated group');
- }
- return start;
- };
- module.exports.types = types;
|