JJ Kasper 4d291bd810
Use regex lexer for gathering named groups from has (#23626)
This is a follow-up change that switches to a regex lexer for gathering the named regex groups, instead of attempting to gather them by executing the regex, since execution can fail to surface the groups when they use specific matching. This also ensures we don't pass the value as a segment when a value is defined and it doesn't use a capture group. Additional tests are added to cover these cases, and the documentation is updated to reflect this. 


## Bug

- [x] Related issues linked using `fixes #number`
- [x] Integration tests added

## Documentation / Examples

- [x] Make sure the linting passes
2021-04-13 12:34:51 +00:00

951 lines
26 KiB

RegExr: Learn, Build, & Test RegEx
Copyright (C) 2017, inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
const profile = require('./profile')
// Small helper namespace; `copy` is an alias for Object.assign.
// NOTE(review): no closing brace is visible for this literal, and the entries below
// (flags … substTokens) look like the `props` table that _buildSupportMap iterates
// rather than members of Utils — a `}` / `const props = {` pair may be missing; confirm against upstream.
const Utils = {
copy: Object.assign,
// 1 = reverse, 0 - normal
// (_buildSupportMap passes `!!props[n]` as the `rev` argument of _addToSupportMap)
flags: 1,
// escape is handled separately
// escCharCodes is handled separately
escCharTypes: 1,
charTypes: 1,
// unquantifiables not included
// unicodeScripts not included
// unicodeCategories not included
// posixCharClasses not included
// modes not included
tokens: 0,
substTokens: 0,
// config not included
// docs not included
class ExpressionLexer {
constructor() {
this._profile = profile
this.string = this.token = this.errors = this.captureGroups = this.namedGroups = null
/**
 * Lexes `str` into a chain of token objects and returns the first token.
 *
 * Returns null when no profile is set, and returns the previously built chain
 * when `str` is identical to the last parsed string. Otherwise the lexer state
 * (modes, errors, capture / named / branch-reset groups) is reset before lexing.
 *
 * @param {string} str - expression source; index 0 and everything from the last
 *   `/` onward is routed to parseFlag, so a `/pattern/flags` layout is presumably expected
 * @returns {Object|null} the first token of the chain, or null without a profile
 *
 * NOTE(review): several statements in this method are visibly incomplete (empty
 * operands, guards without a body, unbalanced braces). They are flagged inline and
 * left untouched; confirm each against upstream before relying on this block.
 */
parse(str) {
if (!this._profile) {
return null
// same input as last time: reuse the existing token chain
if (str === this.string) {
return this.token
this.token = null
this._modes = {}
this.string = str
this.errors = []
let capgroups = (this.captureGroups = [])
let namedgroups = (this.namedGroups = {})
let brgroups = (this.branchResetGroups = [])
let groups = [],
refs = [],
i = 0,
l = str.length
let o,
charset = null
// previous is the previous token, prv is the previous "active" token (!ignore)
let prev = null,
prv = null
let profile = this._profile,
unquantifiable = profile.unquantifiable
let charTypes = profile.charTypes
let closeIndex = str.lastIndexOf('/')
// every character after the last slash is recorded as an enabled mode
for (let i = closeIndex + 1; i < l; i++) {
this._modes[str[i]] = true
// NOTE(review): `c` and `token` are assigned below, but no declaration for them is visible here — confirm.
while (i < l) {
c = str[i]
token = { i: i, l: 1, prev: prev, prv: prv, modes: this._modes }
// NOTE(review): the assignment target is missing; presumably this links the previous token to the new one — confirm.
if (prev) { = token
} else {
// first token of the chain
this.token = token
// dispatch on the current character
if (i === 0 || i >= closeIndex) {
this.parseFlag(str, token)
} else if (c === '(' && !charset) {
this.parseParen(str, token)
// parseParen sets close to null on tokens that still need a closing paren
// NOTE(review): nothing visible ever adds to `groups`, although it is popped below — a statement may be missing here.
if (token.close === null) {
token.depth = groups.length
if (token.capture) {
this.addCaptureGroup(token, groups)
} else if (c === ')' && !charset) {
token.type = 'groupclose'
if (groups.length) {
// NOTE(review): the middle operand of this chained assignment is missing.
o = = groups.pop()
o.close = token
// NOTE(review): no body is visible for the branchreset case.
if (o.type === 'branchreset') {
} else {
token.error = { id: 'groupclose' }
} else if (c === '[') {
charset = this.parseSquareBracket(str, token, charset)
} else if (c === ']' && charset) {
// NOTE(review): two statements appear fused on the next line.
token.type = 'setclose' = charset
charset.close = token
charset = null
} else if (
c === '+' &&
prv &&
// NOTE(review): the condition ends with a dangling `&&`; its last operand is missing.
prv.clss === 'quant' &&
) {
token.type = 'possessive'
token.related = [prv]
} else if ((c === '+' || c === '*') && !charset) {
token.type = charTypes[c]
token.clss = 'quant'
token.min = c === '+' ? 1 : 0
// -1 is the value used for "no upper bound" (parseQuant uses it for `{n,}`)
token.max = -1
} else if (
c === '{' &&
!charset &&
str.substr(i).search(/^{\d+,?\d*}/) !== -1
) {
this.parseQuant(str, token)
} else if (c === '\\') {
this.parseBackSlash(str, token, charset, closeIndex)
} else if (c === '?' && !charset) {
// a `?` after a quantifier makes it lazy; otherwise it is itself a 0..1 quantifier
if (!prv || prv.clss !== 'quant') {
token.type = charTypes[c]
token.clss = 'quant'
token.min = 0
token.max = 1
} else {
token.type = 'lazy'
token.related = [prv]
} else if (
c === '-' &&
charset &&
prv.code !== undefined &&
prv.prv &&
prv.prv.type !== 'range'
) {
// this may be the start of a range, but we'll need to validate after the next token.
token.type = 'range'
} else {
this.parseChar(str, token, charset)
// with the x mode set, whitespace outside a set is flagged to be ignored
if (!charset && this._modes.x && /\s/.test(c)) {
token.ignore = true
token.type = 'ignorews'
// post process token:
// quantifier:
if (token.clss === 'quant') {
if (
!prv ||
prv.close !== undefined ||
unquantifiable[prv.type] ||
// NOTE(review): both operands inside these parentheses are missing.
( && unquantifiable[])
) {
token.error = { id: 'quanttarg' }
} else {
// NOTE(review): the first operand of `||` is missing.
token.related = [ || prv]
// reference:
// NOTE(review): the tested expression and the body of this check are missing; `refs` is never filled in the visible code.
if ( === true) {
// conditional:
let curGroup = groups.length ? groups[groups.length - 1] : null
if (
curGroup &&
(curGroup.type === 'conditional' ||
curGroup.type === 'conditionalgroup') &&
token.type === 'alt'
) {
// the first alternation inside a conditional is its "else"; any further one is an error
if (!curGroup.alt) {
curGroup.alt = token
} else {
token.error = { id: 'extraelse' }
token.related = [curGroup]
token.type = 'conditionalelse'
token.clss = 'special'
} else if (curGroup && curGroup.type === 'branchreset') {
// reset group
curGroup.curGroupNum = curGroup.inGroupNum
// range:
if (prv && prv.type === 'range' && prv.l === 1) {
this.validateRange(str, token)
// js warnings:
// TODO: this isn't ideal, but I'm hesitant to write a more robust solution for a couple of edge cases.
// NOTE(review): the left operand and the body of this check are missing.
if ( === 'js') {
// general:
// NOTE(review): the first operand and the assigned value are missing below.
if ( && !token.clss) {
token.clss =
// NOTE(review): no body is visible for this error check.
if (token.error) {
// advance by the length of the token that was just built
i += token.l
prev = token
if (!token.ignore) {
prv = token
// post processing:
// any group still on the stack was never closed
while (groups.length) {
this.addError(groups.pop(), { id: 'groupopen' })
this.matchRefs(refs, capgroups, namedgroups)
if (charset) {
this.addError(charset, { id: 'setopen' })
return this.token
/**
 * Builds `profile._supportMap`: for every key in `props` the matching profile
 * table is merged through _addToSupportMap, then an `esc_<value>` entry is added
 * for each value of `profile.escCharCodes` and `profile.escChars`.
 *
 * @param {Object} profile - profile object that receives the `_supportMap` property
 *
 * NOTE(review): the guard below has no visible body, the `let` declaration ends in a
 * dangling comma, and `n` / `props` are not declared in the visible code — confirm against upstream.
 */
_buildSupportMap(profile) {
if (profile._supportMap) {
let map = (profile._supportMap = {}),
for (n in props) {
// a truthy props entry requests the "reverse" merge (values become keys)
this._addToSupportMap(map, profile[n], !!props[n])
let o = profile.escCharCodes,
esc = profile.escChars
for (n in o) {
map['esc_' + o[n]] = true
for (n in esc) {
map['esc_' + esc[n]] = true
_addToSupportMap(map, o, rev) {
if (rev) {
for (let n in o) {
map[o[n]] = true
} else {
for (let n in o) {
map[n] = o[n]
/**
 * Stores `error` on the token.
 *
 * @param {Object} token - token to mark
 * @param {Object} [error=token.error] - error descriptor; defaults to the token's current error
 *
 * NOTE(review): parse() resets `this.errors` to an empty array, yet nothing visible
 * here adds to it — a statement registering the token may be missing; confirm.
 */
addError(token, error = token.error) {
token.error = error
/**
 * Flags token types from the list below with a `jsfuture` warning.
 *
 * @param {Object} token - token to inspect; its `error` is overwritten when the type matches
 *
 * NOTE(review): the first guard has no visible body, so it is unclear whether tokens
 * that already carry an error are skipped or are the only ones checked — confirm.
 */
addJSWarnings(token) {
if (token.error) {
if (
token.type === 'neglookbehind' ||
token.type === 'poslookbehind' ||
token.type === 'sticky' ||
token.type === 'unicode' ||
// NOTE(review): loose `==` here, unlike the strict comparisons around it
token.type == 'dotall' ||
token.type === 'unicodecat' ||
token.type === 'unicodescript' ||
token.type === 'namedgroup'
) {
token.error = { id: 'jsfuture', warning: true }
/**
 * Assigns a capture-group number to `token` and checks its name.
 *
 * Inside a branch-reset group the number comes from that group's running
 * counter; otherwise it is the count of capture groups seen so far plus one.
 *
 * @param {Object} token - group token being numbered
 * @param {Object[]} groups - stack of currently open groups
 *
 * NOTE(review): the name checks below have empty operands (the property that holds
 * the group name is missing), and no body is visible for the `!capgroups[...]`
 * check — confirm against upstream.
 */
addCaptureGroup(token, groups) {
// it would be nice to make branch reset groups actually highlight all of the groups that share the same number
// that would require switching to arrays of groups for each group num - requires rearchitecture throughout the app.
let capgroups = this.captureGroups,
brgroups = this.branchResetGroups,
namedgroups = this.namedGroups
// innermost open group; not read again in the visible code
let curGroup = groups.length ? groups[groups.length - 1] : null
if (brgroups.length) {
let brgroup = brgroups[brgroups.length - 1]
token.num = ++brgroup.curGroupNum
} else {
token.num = capgroups.length + 1
if (!capgroups[token.num - 1]) {
if ( && !token.error) {
// a leading digit yields 'badname', an already registered name yields 'dupname'
if (/\d/.test([0])) {
token.error = { id: 'badname' }
} else if (namedgroups[]) {
token.error = { id: 'dupname' }
token.related = [namedgroups[]]
} else {
namedgroups[] = token
/**
 * Marks `token` as a reference (class 'ref') and records in `relIndex` how many
 * capture groups had been seen when the reference was lexed.
 *
 * @param {Object} token - reference token
 * @param {string} str - the referenced name or number as written in the source
 *
 * NOTE(review): both statements below contain a second `=` whose target is missing,
 * so the property that receives `true` and the one that receives `str` cannot be
 * determined from here — confirm against upstream.
 */
getRef(token, str) {
token.clss = 'ref' = true
token.relIndex = this.captureGroups.length = str
/**
 * Resolves every collected reference token to the group it points at.
 *
 * A reference is first looked up by name; if that fails and the name is numeric
 * it is looked up by index, with `+n` / `-n` forms offset by the token's `relIndex`.
 *
 * @param {Object[]} refs - reference tokens; the array is emptied by this method
 * @param {Object[]} indexes - capture groups by position (group 1 at index 0)
 * @param {Object} names - capture groups keyed by name
 *
 * NOTE(review): the initialiser of `name`, the body of the `sign === '-'` check, the
 * assignment target after `if (group) {` and the body of the final error check are
 * missing — confirm against upstream.
 */
matchRefs(refs, indexes, names) {
while (refs.length) {
let token = refs.pop(),
name =,
group = names[name]
if (!group && !isNaN(name)) {
let sign = name[0],
index =
parseInt(name) + (sign === '+' || sign === '-' ? token.relIndex : 0)
if (sign === '-') {
group = indexes[index - 1]
if (group) { = group
token.related = [group]
// dir: 1 when the reference precedes the group, 0 when it sits inside the group
// (or the group is still unclosed), -1 when it follows the group
token.dir =
token.i < group.i
? 1
: !group.close || token.i < group.close.i
? 0
: -1
} else {
delete token.relIndex
if (token.error) {
/**
 * Re-interprets a numeric reference as an octal escape or a literal digit escape,
 * or marks it with an 'unmatchedref' error.
 *
 * @param {Object} token - reference token to rewrite in place
 *
 * NOTE(review): the initialisers of `name` and `next` are missing below, so where the
 * reference text and the following token come from is not visible — confirm.
 */
refToOctal(token) {
// PCRE: \# unmatched, \0 \00 \## = octal
// JS: \# \0 \00 \## = octal
// PCRE matches \8 \9 to "8" "9"
// JS: without the u flag \8 \9 match "8" "9" in IE, FF & Chrome, and "\8" "\9" in Safari. We support the former.
// JS: with the u flag, Chrome & FF throw an esc error, Safari does not.
// TODO: handle \0 for PCRE? Would need more testing.
// TODO: this doesn't handle two digit refs with 8/9 in them. Ex. \18 - not even sure what this is interpreted as.
let name =,
profile = this._profile
if (token.type !== 'numref') {
// not a simple \4 style reference, so can't decompose into an octal.
token.error = { id: 'unmatchedref' }
} else if (
/^[0-7]{2}$/.test(name) ||
(profile.config.reftooctalalways && /^[0-7]$/.test(name))
) {
// octal
let next =,
char = String.fromCharCode(next.code)
// absorb one more octal digit when the result still fits in a byte
if (
next.type === 'char' &&
char >= '0' &&
char <= '7' &&
parseInt(name + char, 8) <= 255
) {
name += char
token.code = parseInt(name, 8)
token.clss = 'esc'
token.type = 'escoctal'
} else if (name === '8' || name === '9') {
this.parseEscChar(token, name)
} else {
token.error = { id: 'unmatchedref' }
/**
 * NOTE(review): only a fragment of this method is visible — a chained assignment
 * whose operands are missing — so its behavior cannot be documented from here.
 * By its name it presumably folds the following token into `token`; confirm against upstream.
 *
 * @param {Object} token - token passed in by the caller
 */
mergeNext(token) {
let next = = = token
parseFlag(str, token) {
// note that this doesn't deal with misformed patterns or incorrect flags.
let i = token.i,
c = str[i]
if (str[i] === '/') {
token.type = i === 0 ? 'open' : 'close'
if (i !== 0) {
token.related = [this.token]
this.token.related = [token]
} else {
token.type = this._profile.flags[c]
//token.clear = true;
parseChar(str, token, charset) {
let c = str[token.i]
token.type = (!charset && this._profile.charTypes[c]) || 'char'
if (!charset && c === '/') {
token.error = { id: 'fwdslash' }
if (token.type === 'char') {
token.code = c.charCodeAt(0)
} else if (ExpressionLexer.ANCHOR_TYPES[token.type]) {
token.clss = 'anchor'
} else if (token.type === 'dot') {
token.clss = 'charclass'
return token
parseSquareBracket(str, token, charset) {
let match
if (
this._profile.tokens.posixcharclass &&
(match = str.substr(token.i).match(/^\[(:|\.)([^\]]*?)\1]/))
) {
// posixcharclass: [:alpha:]
// posixcollseq: [.ch.]
// currently neither flavor supports posixcollseq, but PCRE does flag as an error:
// TODO: the expression above currently does not catch [.\].]
token.l = match[0].length
token.value = match[2]
token.clss = 'charclass'
if (match[1] === ':') {
token.type = 'posixcharclass'
if (!this._profile.posixCharClasses[match[2]]) {
token.error = { id: 'posixcharclassbad' }
} else if (!charset) {
token.error = { id: 'posixcharclassnoset' }
} else {
token.type = 'posixcollseq'
// TODO: can this be generalized? Right now, no, because we assign ids that aren't in the profile.
token.error = { id: 'notsupported' }
} else if (!charset) {
// set [a-z] [aeiou]
// setnot [^a-z]
token.type = token.clss = 'set'
if (str[token.i + 1] === '^') {
token.type += 'not'
charset = token
} else {
// [[] (square bracket inside a set)
this.parseChar(str, token, charset)
return charset
/**
 * Classifies an opening parenthesis and sets the token's type, class and length.
 *
 * A plain `(` becomes a capturing 'group'. For `(?…` the text after the `?` is
 * matched against the known constructs in the order of the branches below; if
 * none fits, the token is treated as an ordinary capturing group. Tokens that
 * need a closing paren get `close = null`. Finally the resulting type is checked
 * against `profile.tokens` and flagged 'notsupported' when absent.
 *
 * @param {string} str - full expression source
 * @param {Object} token - token being built; `token.i` is the index of the `(`
 * @returns {Object} the same token
 *
 * NOTE(review): `match` is assigned throughout without a visible declaration, and
 * the namedgroup branch contains two statements fused on one line — confirm against upstream.
 */
parseParen(str, token) {
// NOTE(review): the next 15 lines read like the remains of a block comment listing the
// `(?…)` constructs; its delimiters appear to be missing, so they are not valid code as shown.
. group:
. lookahead: ?= ?!
. noncap: ?:
. lookbehind: ?<= ?<!
. named: ?P<name> ?'name' ?<name>
. namedref: ?P=name Also: \g'name' \k'name' etc
. comment: ?#
. atomic: ?>
. recursion: ?0 ?R Also: \g<0>
. define: ?(DEFINE)
. subroutine: ?1 ?-1 ?&name ?P>name
conditionalgroup: ?(1)a|b ?(-1)a|b ?(name)a|b
conditional: ?(?=if)then|else
mode: ?c-i
branchreset: ?|
token.clss = token.type = 'group'
if (str[token.i + 1] !== '?') {
token.close = null // indicates that it needs a close token.
token.capture = true
return token
// `sub` is everything after "(?", `s` its first character
let sub = str.substr(token.i + 2),
s = sub[0]
if (s === ':') {
// (?:foo)
token.type = 'noncapgroup'
token.close = null
token.l = 3
} else if (s === '>') {
// (?>foo)
token.type = 'atomic'
token.close = null
token.l = 3
} else if (s === '|') {
// (?|(a)|(b))
token.type = 'branchreset'
token.close = null
token.l = 3
// remember the group count on entry so numbering can restart for each alternative
token.inGroupNum = token.curGroupNum = this.captureGroups.length
} else if (s === '#' && (match = sub.match(/[^)]*\)/))) {
// (?#foo)
token.clss = token.type = 'comment'
token.ignore = true
token.l = 2 + match[0].length
} else if (/^(R|0)\)/.test(sub)) {
// (?R) (?0)
token.clss = 'ref'
token.type = 'recursion'
token.l = 4
} else if ((match = sub.match(/^P=(\w+)\)/i))) {
// (?P=name)
token.type = 'namedref'
this.getRef(token, match[1])
token.l = match[0].length + 2
} else if (/^\(DEFINE\)/.test(sub)) {
// (?(DEFINE)foo)
token.type = 'define'
token.close = null
token.l = 10
} else if ((match = sub.match(/^<?[=!]/))) {
// (?=foo) (?<!foo)
// a lookaround that directly follows a 'conditional' opener is that conditional's test
let isCond = token.prv.type === 'conditional'
token.clss = isCond ? 'special' : 'lookaround'
token.close = null
s = match[0]
token.behind = s[0] === '<'
// +token.behind is 1 for lookbehind, skipping the '<' to reach the '=' / '!'
token.negative = s[+token.behind] === '!'
token.type = isCond
? 'condition'
: (token.negative ? 'neg' : 'pos') +
'look' +
(token.behind ? 'behind' : 'ahead')
if (isCond) {
token.prv.related = [token]
token.prv.condition = token
token.related = [token.prv]
token.l = s.length + 2
} else if (
(match = sub.match(/^<(\w+)>/)) ||
(this._profile.config.namedgroupalt &&
((match = sub.match(/^'(\w+)'/)) || (match = sub.match(/^P<(\w+)>/))))
) {
// (?<name>foo) (?'name'foo) (?P<name>foo)
token.type = 'namedgroup'
// NOTE(review): the target that receives match[1] is missing on the next line.
token.close = null = match[1]
token.capture = true
token.l = match[0].length + 2
} else if (
(match = sub.match(/^([-+]?\d\d?)\)/)) ||
(match = sub.match(/^(?:&|P>)(\w+)\)/))
) {
// (?1) (?-1) (?&name) (?P>name)
token.type = (isNaN(match[1]) ? 'named' : 'num') + 'subroutine'
this.getRef(token, match[1])
token.l = match[0].length + 2
} else if (
(match = sub.match(/^\(([-+]?\d\d?)\)/)) ||
(match = sub.match(/^\((\w+)\)/))
) {
// (?(1)a|b) (?(-1)a|b) (?(name)a|b)
this.getRef(token, match[1])
token.clss = 'special'
token.type = 'conditionalgroup'
token.close = null
token.l = match[0].length + 2
} else if (/^\(\?<?[=!]/.test(sub)) {
// (?(?=if)then|else)
token.clss = 'special'
token.type = 'conditional'
token.close = null
// only "(?" is consumed; the lookaround that follows is lexed as its own token
token.l = 2
} else if (this.parseMode(token, sub)) {
// (?i-x)
// do nothing. handled by parseMode.
} else {
// error, found a (? without matching anything. Treat it as a normal group and let it error out.
token.close = null
token.capture = true
if (!this._profile.tokens[token.type]) {
token.error = { id: 'notsupported' }
return token
/**
 * Lexes an escape sequence that starts with a backslash and extends the token's
 * length to cover it.
 *
 * Which escapes are recognised depends on `profile.tokens`, `profile.flags` and
 * the active modes; the branches below are tried in order and the last one
 * falls back to the single-character escape tables.
 *
 * @param {string} str - full expression source
 * @param {Object} token - token being built; `token.i` is the index of the backslash
 * @param {Object|null} charset - the currently open character set, if any
 * @param {number} closeIndex - index of the closing `/`; the `closeIndex || str.length`
 *   check below falls back to the end of `str` when it is falsy
 * @returns {Object|undefined} the token on the visible `return` paths
 *
 * NOTE(review): the `let` declaration ends in a dangling comma (`match` and `val` are
 * used without a visible declaration), and several guards show no early exit
 * although later code looks like it assumes one — confirm against upstream.
 */
parseBackSlash(str, token, charset, closeIndex) {
// Note: Chrome does weird things with \x & \u depending on a number of factors, we ignore this.
let i = token.i,
profile = this._profile
// `sub` is everything after the backslash, `c` the escaped character
let sub = str.substr(i + 1),
c = sub[0],
// a backslash directly in front of the closing delimiter escapes nothing
if (i + 1 === (closeIndex || str.length)) {
token.error = { id: 'esccharopen' }
if (!charset && (match = sub.match(/^\d\d?/))) {
// \1 to \99
// write this as a reference for now, and re-write it later if it doesn't match a group
token.type = 'numref'
this.getRef(token, match[0])
token.l += match[0].length
return token
if (profile.tokens.namedref && !charset && (c === 'g' || c === 'k')) {
return this.parseRef(token, sub)
if (
profile.tokens.unicodecat &&
(!profile.flags.u || this._modes.u) &&
(c === 'p' || c === 'P')
) {
// unicode: \p{Ll} \pL
return this.parseUnicode(token, sub)
} else if (profile.tokens.escsequence && c === 'Q') {
// escsequence: \Q...\E
token.type = 'escsequence'
// `e` counts the characters excluded from the value: "\Q", plus "\E" when present
let e = 2
if ((i = sub.indexOf('\\E')) !== -1) {
token.l += i + 2
e += 2
} else {
// unterminated: the sequence runs up to the closing delimiter
token.l += closeIndex - token.i - 1
token.value = str.substr(token.i + 2, token.l - e)
} else if (
profile.tokens.escunicodeub &&
this._modes.u &&
(match = sub.match(/^u\{(\d+)}/))
) {
// unicodeu: \u{0061}
token.type = 'escunicodeub'
token.l += match[0].length
token.code = parseInt(match[1], 16)
} else if (
profile.tokens.escunicodeu &&
(match = sub.match(/^u([\da-fA-F]{4})/))
) {
// unicode: \uFFFF
// update SubstLexer if this changes:
token.type = 'escunicodeu'
token.l += match[0].length
token.code = parseInt(match[1], 16)
} else if (
profile.tokens.escunicodexb &&
(match = sub.match(/^x\{(.*?)}/))
) {
// unicode: \x{FFFF}
token.type = 'escunicodexb'
token.l += match[0].length
val = parseInt(match[1], 16)
// PCRE errors on more than 2 digits (>255). In theory it should allow 4?
if (isNaN(val) || val > 255 || /[^\da-f]/i.test(match[1])) {
token.error = { id: 'esccharbad' }
} else {
token.code = val
} else if ((match = sub.match(/^x([\da-fA-F]{0,2})/))) {
// hex ascii: \xFF
token.type = 'eschexadecimal'
token.l += match[0].length
// a bare \x with no digits gets code 0
token.code = parseInt(match[1] || 0, 16)
} else if ((match = sub.match(/^c([a-zA-Z])?/))) {
// control char: \cA \cz
// also handles: \c
// not supported in JS strings
token.type = 'esccontrolchar'
if (match[1]) {
token.code = match[1].toUpperCase().charCodeAt(0) - 64 // A=65
token.l += 2
} else if (profile.config.ctrlcodeerr) {
token.error = { id: 'esccharbad' }
} else {
return this.parseChar(str, token, charset) // this builds the "/" token
} else if ((match = sub.match(/^[0-7]{1,3}/))) {
// octal ascii: \011
token.type = 'escoctal'
sub = match[0]
// three digits above 0o377 do not fit in a byte: keep only the first two
if (parseInt(sub, 8) > 255) {
sub = sub.substr(0, 2)
token.l += sub.length
token.code = parseInt(sub, 8)
} else if (profile.tokens.escoctalo && (match = sub.match(/^o\{(.*?)}/i))) {
// \o{377}
token.type = 'escoctal'
token.l += match[0].length
val = parseInt(match[1], 8)
if (isNaN(val) || val > 255 || /[^0-7]/.test(match[1])) {
token.error = { id: 'esccharbad' }
} else {
token.code = val
} else {
// single char
if ((token.type = profile.escCharTypes[c])) {
token.clss = ExpressionLexer.ANCHOR_TYPES[token.type]
? 'anchor'
: 'charclass'
return token
token.code = profile.escCharCodes[c]
if (token.code === undefined || token.code === false) {
// unrecognized.
return this.parseEscChar(token, c)
// update SubstLexer if this changes:
token.type = 'esc_' + token.code
token.clss = 'esc'
return token
/**
 * Handles an escaped character that matched no known escape: it becomes either a
 * literal 'escchar' carrying the character's code, or an 'esccharbad' error.
 *
 * @param {Object} token - token being built; its length is set to 2
 * @param {string} c - the character that follows the backslash
 *
 * NOTE(review): the condition below ends with a dangling `||`; its second operand
 * is missing — confirm against upstream.
 */
parseEscChar(token, c) {
// unrecognized escchar: \u \a \8, etc
// JS: allowed except if u flag set, Safari still allows \8 \9
// PCRE: allows \8 \9 but not others // TODO: support?
let profile = this._profile
token.l = 2
if (
(!profile.badEscChars[c] && profile.tokens.escchar && !this._modes.u) ||
) {
token.type = 'escchar'
token.code = c.charCodeAt(0)
token.clss = 'esc'
} else {
token.error = { id: 'esccharbad' }
parseRef(token, sub) {
// namedref: \k<name> \k'name' \k{name} \g{name}
// namedsubroutine: \g<name> \g'name'
// numref: \g1 \g+2 \g{2}
// numsubroutine: \g<-1> \g'1'
// recursion: \g<0> \g'0'
let c = sub[0],
s = '',
if ((match = sub.match(/^[gk](?:'\w*'|<\w*>|{\w*})/))) {
s = match[0].substr(2, match[0].length - 3)
if (c === 'k' && !isNaN(s)) {
s = ''
} // TODO: specific error for numeric \k?
} else if (
(match = sub.match(/^g(?:({[-+]?\d+}|<[-+]?\d+>|'[-+]?\d+')|([-+]?\d+))/))
) {
s =
match[2] !== undefined
? match[2]
: match[1].substr(1, match[1].length - 2)
let isRef = c === 'k' || !(sub[1] === "'" || sub[1] === '<')
if (!isRef && s == 0) {
token.type = 'recursion'
token.clss = 'ref'
} else {
// namedref, extnumref, namedsubroutine, numsubroutine
token.type =
(isNaN(s) ? 'named' : (isRef ? 'ext' : '') + 'num') +
(isRef ? 'ref' : 'subroutine')
this.getRef(token, s)
token.l += match ? match[0].length : 1
parseUnicode(token, sub) {
// unicodescript: \p{Cherokee}
// unicodecat: \p{Ll} \pL
// not: \P{Ll} \p{^Lu}
let match = sub.match(/p\{\^?([^}]*)}/i),
val = match && match[1],
not = sub[0] === 'P'
if (!match && (match = sub.match(/[pP]([LMZSNPC])/))) {
val = match[1]
} else {
not = not !== (sub[2] === '^')
token.l += match ? match[0].length : 1
token.type = 'unicodecat'
if (this._profile.unicodeScripts[val]) {
token.type = 'unicodescript'
} else if (!this._profile.unicodeCategories[val]) {
val = null
if (not) {
token.type = 'not' + token.type
if ((!this._profile.config.unicodenegated && sub[2] === '^') || !val) {
token.error = { id: 'unicodebad' }
token.value = val
token.clss = 'charclass'
return token
/**
 * Lexes an inline mode modifier such as `(?i-x)`: letters before a `-` are
 * switched on, letters after it are switched off. On success the lexer's active
 * modes are replaced with the updated copy; a letter missing from
 * `profile.modes` yields a 'modebad' error instead.
 *
 * @param {Object} token - token being built for the opening paren
 * @param {string} sub - source text after "(?"
 * @returns {Object} the token on the visible return path
 *
 * NOTE(review): the no-match guard has no visible body, the `let` declaration ends in
 * a dangling comma (`c` is used without a visible declaration), the loop shows no
 * `continue` / `break` after the `-` and unsupported-letter checks, and two lines
 * have missing operands — confirm against upstream.
 */
parseMode(token, sub) {
// (?i-x)
// supported modes in PCRE: i-caseinsens, x-freespacing, s-dotall, m-multiline, U-switchlazy, [J-samename]
let match = sub.match(/^[-a-z]+\)/i)
if (!match) {
let supModes = this._profile.modes
// work on a copy so the active modes stay untouched if the modifier turns out to be bad
let modes = Utils.copy({}, this._modes),
bad = false,
not = false,
s = match[0],
token.on = = ''
// the last character of the match is the closing paren, hence length - 1
for (let i = 0, l = s.length - 1; i < l; i++) {
c = s[i]
if (c === '-') {
not = true
if (!supModes[c]) {
bad = true
modes[c] = !not
// remove any earlier occurrence so a letter is listed at most once
token.on = token.on.replace(c, '')
if (not) { =, '') += c
} else {
token.on += c
token.clss = 'special'
token.type = 'mode'
token.l = match[0].length + 2
if (bad) {
token.error = { id: 'modebad' }
token.errmode = c
} else {
this._modes = modes
return token
parseQuant(str, token) {
// quantifier: {0,3} {3} {1,}
token.type = token.clss = 'quant'
let i = token.i
let end = str.indexOf('}', i + 1)
token.l += end - i
let arr = str.substring(i + 1, end).split(',')
token.min = parseInt(arr[0])
token.max =
arr[1] === undefined ? token.min : arr[1] === '' ? -1 : parseInt(arr[1])
if (token.max !== -1 && token.min > token.max) {
token.error = { id: 'quantrev' }
return token
validateRange(str, end) {
// char range: [a-z] [\11-\n]
let next = end,
token = end.prv,
prv = token.prv
if (prv.code === undefined || next.code === undefined) {
// not a range, rewrite as a char:
this.parseChar(str, token)
} else {
token.clss = 'set'
if (prv.code > next.code) {
// this gets added here because parse has already moved to the next token:
this.errors.push((token.error = { id: 'rangerev' }))
// preserve as separate tokens, but treat as one in the UI:
next.proxy = prv.proxy = token
token.set = [prv, token, next]
// Token types that are given the 'anchor' class by parseChar and parseBackSlash.
ExpressionLexer.ANCHOR_TYPES = {
  bof: true,
  eof: true,
  bos: true,
  eos: true,
  abseos: true,
  wordboundary: true,
  notwordboundary: true,
  prevmatchend: true,
}
module.exports = ExpressionLexer