4d291bd810
This is a follow-up to https://github.com/vercel/next.js/pull/23588 to update to use a regex lexer to gather the named regex groups instead of attempting to gather them through executing the regex since it can fail to gather the regex groups when they are using specific matching. This also ensures we don't pass the value as a segment when value is defined and it doesn't use a capture group. Additional tests are added to cover these cases and documentation updated to reflect this. Closes: https://github.com/vercel/next.js/issues/23415 ## Bug - [x] Related issues linked using `fixes #number` - [x] Integration tests added ## Documentation / Examples - [x] Make sure the linting passes
951 lines
26 KiB
JavaScript
951 lines
26 KiB
JavaScript
/*
|
|
RegExr: Learn, Build, & Test RegEx
|
|
Copyright (C) 2017 gskinner.com, inc.
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
const profile = require('./profile')
|
|
|
|
const Utils = {
|
|
copy: Object.assign,
|
|
}
|
|
|
|
const SUPPORT_MAP_PROPS = {
|
|
// 1 = reverse, 0 - normal
|
|
flags: 1,
|
|
// escape is handled separately
|
|
// escCharCodes is handled separately
|
|
escCharTypes: 1,
|
|
charTypes: 1,
|
|
// unquantifiables not included
|
|
// unicodeScripts not included
|
|
// unicodeCategories not included
|
|
// posixCharClasses not included
|
|
// modes not included
|
|
tokens: 0,
|
|
substTokens: 0,
|
|
// config not included
|
|
// docs not included
|
|
}
|
|
|
|
class ExpressionLexer {
|
|
constructor() {
|
|
this._profile = profile
|
|
this.string = this.token = this.errors = this.captureGroups = this.namedGroups = null
|
|
this._buildSupportMap(this._profile)
|
|
}
|
|
|
|
parse(str) {
|
|
if (!this._profile) {
|
|
return null
|
|
}
|
|
if (str === this.string) {
|
|
return this.token
|
|
}
|
|
|
|
this.token = null
|
|
this._modes = {}
|
|
this.string = str
|
|
this.errors = []
|
|
let capgroups = (this.captureGroups = [])
|
|
let namedgroups = (this.namedGroups = {})
|
|
let brgroups = (this.branchResetGroups = [])
|
|
let groups = [],
|
|
refs = [],
|
|
i = 0,
|
|
l = str.length
|
|
let o,
|
|
c,
|
|
token,
|
|
charset = null
|
|
// previous is the previous token, prv is the previous "active" token (!ignore)
|
|
let prev = null,
|
|
prv = null
|
|
let profile = this._profile,
|
|
unquantifiable = profile.unquantifiable
|
|
let charTypes = profile.charTypes
|
|
let closeIndex = str.lastIndexOf('/')
|
|
|
|
for (let i = closeIndex + 1; i < l; i++) {
|
|
this._modes[str[i]] = true
|
|
}
|
|
|
|
while (i < l) {
|
|
c = str[i]
|
|
|
|
token = { i: i, l: 1, prev: prev, prv: prv, modes: this._modes }
|
|
if (prev) {
|
|
prev.next = token
|
|
} else {
|
|
this.token = token
|
|
}
|
|
|
|
if (i === 0 || i >= closeIndex) {
|
|
this.parseFlag(str, token)
|
|
} else if (c === '(' && !charset) {
|
|
this.parseParen(str, token)
|
|
if (token.close === null) {
|
|
token.depth = groups.length
|
|
groups.push(token)
|
|
}
|
|
if (token.capture) {
|
|
this.addCaptureGroup(token, groups)
|
|
}
|
|
} else if (c === ')' && !charset) {
|
|
token.type = 'groupclose'
|
|
if (groups.length) {
|
|
o = token.open = groups.pop()
|
|
o.close = token
|
|
if (o.type === 'branchreset') {
|
|
brgroups.pop()
|
|
}
|
|
} else {
|
|
token.error = { id: 'groupclose' }
|
|
}
|
|
} else if (c === '[') {
|
|
charset = this.parseSquareBracket(str, token, charset)
|
|
} else if (c === ']' && charset) {
|
|
token.type = 'setclose'
|
|
token.open = charset
|
|
charset.close = token
|
|
charset = null
|
|
} else if (
|
|
c === '+' &&
|
|
prv &&
|
|
prv.clss === 'quant' &&
|
|
profile.tokens.possessive
|
|
) {
|
|
token.type = 'possessive'
|
|
token.related = [prv]
|
|
} else if ((c === '+' || c === '*') && !charset) {
|
|
token.type = charTypes[c]
|
|
token.clss = 'quant'
|
|
token.min = c === '+' ? 1 : 0
|
|
token.max = -1
|
|
} else if (
|
|
c === '{' &&
|
|
!charset &&
|
|
str.substr(i).search(/^{\d+,?\d*}/) !== -1
|
|
) {
|
|
this.parseQuant(str, token)
|
|
} else if (c === '\\') {
|
|
this.parseBackSlash(str, token, charset, closeIndex)
|
|
} else if (c === '?' && !charset) {
|
|
if (!prv || prv.clss !== 'quant') {
|
|
token.type = charTypes[c]
|
|
token.clss = 'quant'
|
|
token.min = 0
|
|
token.max = 1
|
|
} else {
|
|
token.type = 'lazy'
|
|
token.related = [prv]
|
|
}
|
|
} else if (
|
|
c === '-' &&
|
|
charset &&
|
|
prv.code !== undefined &&
|
|
prv.prv &&
|
|
prv.prv.type !== 'range'
|
|
) {
|
|
// this may be the start of a range, but we'll need to validate after the next token.
|
|
token.type = 'range'
|
|
} else {
|
|
this.parseChar(str, token, charset)
|
|
if (!charset && this._modes.x && /\s/.test(c)) {
|
|
token.ignore = true
|
|
token.type = 'ignorews'
|
|
}
|
|
}
|
|
|
|
// post process token:
|
|
// quantifier:
|
|
if (token.clss === 'quant') {
|
|
if (
|
|
!prv ||
|
|
prv.close !== undefined ||
|
|
unquantifiable[prv.type] ||
|
|
(prv.open && unquantifiable[prv.open.type])
|
|
) {
|
|
token.error = { id: 'quanttarg' }
|
|
} else {
|
|
token.related = [prv.open || prv]
|
|
}
|
|
}
|
|
|
|
// reference:
|
|
if (token.group === true) {
|
|
refs.push(token)
|
|
}
|
|
|
|
// conditional:
|
|
let curGroup = groups.length ? groups[groups.length - 1] : null
|
|
if (
|
|
curGroup &&
|
|
(curGroup.type === 'conditional' ||
|
|
curGroup.type === 'conditionalgroup') &&
|
|
token.type === 'alt'
|
|
) {
|
|
if (!curGroup.alt) {
|
|
curGroup.alt = token
|
|
} else {
|
|
token.error = { id: 'extraelse' }
|
|
}
|
|
token.related = [curGroup]
|
|
token.type = 'conditionalelse'
|
|
token.clss = 'special'
|
|
} else if (curGroup && curGroup.type === 'branchreset') {
|
|
// reset group
|
|
curGroup.curGroupNum = curGroup.inGroupNum
|
|
}
|
|
|
|
// range:
|
|
if (prv && prv.type === 'range' && prv.l === 1) {
|
|
this.validateRange(str, token)
|
|
}
|
|
|
|
// js warnings:
|
|
// TODO: this isn't ideal, but I'm hesitant to write a more robust solution for a couple of edge cases.
|
|
if (profile.id === 'js') {
|
|
this.addJSWarnings(token)
|
|
}
|
|
|
|
// general:
|
|
if (token.open && !token.clss) {
|
|
token.clss = token.open.clss
|
|
}
|
|
if (token.error) {
|
|
this.addError(token)
|
|
}
|
|
i += token.l
|
|
prev = token
|
|
if (!token.ignore) {
|
|
prv = token
|
|
}
|
|
}
|
|
|
|
// post processing:
|
|
while (groups.length) {
|
|
this.addError(groups.pop(), { id: 'groupopen' })
|
|
}
|
|
this.matchRefs(refs, capgroups, namedgroups)
|
|
if (charset) {
|
|
this.addError(charset, { id: 'setopen' })
|
|
}
|
|
|
|
return this.token
|
|
}
|
|
|
|
_buildSupportMap(profile) {
|
|
if (profile._supportMap) {
|
|
return
|
|
}
|
|
let map = (profile._supportMap = {}),
|
|
props = SUPPORT_MAP_PROPS,
|
|
n
|
|
for (n in props) {
|
|
this._addToSupportMap(map, profile[n], !!props[n])
|
|
}
|
|
let o = profile.escCharCodes,
|
|
esc = profile.escChars
|
|
for (n in o) {
|
|
map['esc_' + o[n]] = true
|
|
}
|
|
for (n in esc) {
|
|
map['esc_' + esc[n]] = true
|
|
}
|
|
}
|
|
|
|
_addToSupportMap(map, o, rev) {
|
|
if (rev) {
|
|
for (let n in o) {
|
|
map[o[n]] = true
|
|
}
|
|
} else {
|
|
for (let n in o) {
|
|
map[n] = o[n]
|
|
}
|
|
}
|
|
}
|
|
|
|
addError(token, error = token.error) {
|
|
token.error = error
|
|
this.errors.push(token)
|
|
}
|
|
|
|
addJSWarnings(token) {
|
|
if (token.error) {
|
|
return
|
|
}
|
|
if (
|
|
token.type === 'neglookbehind' ||
|
|
token.type === 'poslookbehind' ||
|
|
token.type === 'sticky' ||
|
|
token.type === 'unicode' ||
|
|
token.type == 'dotall' ||
|
|
token.type === 'unicodecat' ||
|
|
token.type === 'unicodescript' ||
|
|
token.type === 'namedgroup'
|
|
) {
|
|
token.error = { id: 'jsfuture', warning: true }
|
|
}
|
|
}
|
|
|
|
addCaptureGroup(token, groups) {
|
|
// it would be nice to make branch reset groups actually highlight all of the groups that share the same number
|
|
// that would require switching to arrays of groups for each group num - requires rearchitecture throughout the app.
|
|
let capgroups = this.captureGroups,
|
|
brgroups = this.branchResetGroups,
|
|
namedgroups = this.namedGroups
|
|
let curGroup = groups.length ? groups[groups.length - 1] : null
|
|
if (brgroups.length) {
|
|
let brgroup = brgroups[brgroups.length - 1]
|
|
token.num = ++brgroup.curGroupNum
|
|
} else {
|
|
token.num = capgroups.length + 1
|
|
}
|
|
if (!capgroups[token.num - 1]) {
|
|
capgroups.push(token)
|
|
}
|
|
if (token.name && !token.error) {
|
|
if (/\d/.test(token.name[0])) {
|
|
token.error = { id: 'badname' }
|
|
} else if (namedgroups[token.name]) {
|
|
token.error = { id: 'dupname' }
|
|
token.related = [namedgroups[token.name]]
|
|
} else {
|
|
namedgroups[token.name] = token
|
|
}
|
|
}
|
|
}
|
|
|
|
getRef(token, str) {
|
|
token.clss = 'ref'
|
|
token.group = true
|
|
token.relIndex = this.captureGroups.length
|
|
token.name = str
|
|
}
|
|
|
|
matchRefs(refs, indexes, names) {
|
|
while (refs.length) {
|
|
let token = refs.pop(),
|
|
name = token.name,
|
|
group = names[name]
|
|
|
|
if (!group && !isNaN(name)) {
|
|
let sign = name[0],
|
|
index =
|
|
parseInt(name) + (sign === '+' || sign === '-' ? token.relIndex : 0)
|
|
if (sign === '-') {
|
|
index++
|
|
}
|
|
group = indexes[index - 1]
|
|
}
|
|
if (group) {
|
|
token.group = group
|
|
token.related = [group]
|
|
token.dir =
|
|
token.i < group.i
|
|
? 1
|
|
: !group.close || token.i < group.close.i
|
|
? 0
|
|
: -1
|
|
} else {
|
|
delete token.group
|
|
delete token.relIndex
|
|
this.refToOctal(token)
|
|
if (token.error) {
|
|
this.errors.push(token.error)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
refToOctal(token) {
|
|
// PCRE: \# unmatched, \0 \00 \## = octal
|
|
// JS: \# \0 \00 \## = octal
|
|
// PCRE matches \8 \9 to "8" "9"
|
|
// JS: without the u flag \8 \9 match "8" "9" in IE, FF & Chrome, and "\8" "\9" in Safari. We support the former.
|
|
// JS: with the u flag, Chrome & FF throw an esc error, Safari does not.
|
|
|
|
// TODO: handle \0 for PCRE? Would need more testing.
|
|
// TODO: this doesn't handle two digit refs with 8/9 in them. Ex. \18 - not even sure what this is interpreted as.
|
|
let name = token.name,
|
|
profile = this._profile
|
|
if (token.type !== 'numref') {
|
|
// not a simple \4 style reference, so can't decompose into an octal.
|
|
token.error = { id: 'unmatchedref' }
|
|
} else if (
|
|
/^[0-7]{2}$/.test(name) ||
|
|
(profile.config.reftooctalalways && /^[0-7]$/.test(name))
|
|
) {
|
|
// octal
|
|
let next = token.next,
|
|
char = String.fromCharCode(next.code)
|
|
if (
|
|
next.type === 'char' &&
|
|
char >= '0' &&
|
|
char <= '7' &&
|
|
parseInt(name + char, 8) <= 255
|
|
) {
|
|
name += char
|
|
this.mergeNext(token)
|
|
}
|
|
token.code = parseInt(name, 8)
|
|
token.clss = 'esc'
|
|
token.type = 'escoctal'
|
|
delete token.name
|
|
} else if (name === '8' || name === '9') {
|
|
this.parseEscChar(token, name)
|
|
delete token.name
|
|
} else {
|
|
token.error = { id: 'unmatchedref' }
|
|
}
|
|
}
|
|
|
|
mergeNext(token) {
|
|
let next = token.next
|
|
token.next = next.next
|
|
token.next.prev = token
|
|
token.l++
|
|
}
|
|
|
|
parseFlag(str, token) {
|
|
// note that this doesn't deal with misformed patterns or incorrect flags.
|
|
let i = token.i,
|
|
c = str[i]
|
|
if (str[i] === '/') {
|
|
token.type = i === 0 ? 'open' : 'close'
|
|
if (i !== 0) {
|
|
token.related = [this.token]
|
|
this.token.related = [token]
|
|
}
|
|
} else {
|
|
token.type = this._profile.flags[c]
|
|
}
|
|
//token.clear = true;
|
|
}
|
|
|
|
parseChar(str, token, charset) {
|
|
let c = str[token.i]
|
|
token.type = (!charset && this._profile.charTypes[c]) || 'char'
|
|
if (!charset && c === '/') {
|
|
token.error = { id: 'fwdslash' }
|
|
}
|
|
if (token.type === 'char') {
|
|
token.code = c.charCodeAt(0)
|
|
} else if (ExpressionLexer.ANCHOR_TYPES[token.type]) {
|
|
token.clss = 'anchor'
|
|
} else if (token.type === 'dot') {
|
|
token.clss = 'charclass'
|
|
}
|
|
return token
|
|
}
|
|
|
|
parseSquareBracket(str, token, charset) {
|
|
let match
|
|
if (
|
|
this._profile.tokens.posixcharclass &&
|
|
(match = str.substr(token.i).match(/^\[(:|\.)([^\]]*?)\1]/))
|
|
) {
|
|
// posixcharclass: [:alpha:]
|
|
// posixcollseq: [.ch.]
|
|
// currently neither flavor supports posixcollseq, but PCRE does flag as an error:
|
|
// TODO: the expression above currently does not catch [.\].]
|
|
token.l = match[0].length
|
|
token.value = match[2]
|
|
token.clss = 'charclass'
|
|
if (match[1] === ':') {
|
|
token.type = 'posixcharclass'
|
|
if (!this._profile.posixCharClasses[match[2]]) {
|
|
token.error = { id: 'posixcharclassbad' }
|
|
} else if (!charset) {
|
|
token.error = { id: 'posixcharclassnoset' }
|
|
}
|
|
} else {
|
|
token.type = 'posixcollseq'
|
|
// TODO: can this be generalized? Right now, no, because we assign ids that aren't in the profile.
|
|
token.error = { id: 'notsupported' }
|
|
}
|
|
} else if (!charset) {
|
|
// set [a-z] [aeiou]
|
|
// setnot [^a-z]
|
|
token.type = token.clss = 'set'
|
|
if (str[token.i + 1] === '^') {
|
|
token.l++
|
|
token.type += 'not'
|
|
}
|
|
charset = token
|
|
} else {
|
|
// [[] (square bracket inside a set)
|
|
this.parseChar(str, token, charset)
|
|
}
|
|
return charset
|
|
}
|
|
|
|
parseParen(str, token) {
|
|
/*
|
|
core:
|
|
. group:
|
|
. lookahead: ?= ?!
|
|
. noncap: ?:
|
|
PCRE:
|
|
. lookbehind: ?<= ?<!
|
|
. named: ?P<name> ?'name' ?<name>
|
|
. namedref: ?P=name Also: \g'name' \k'name' etc
|
|
. comment: ?#
|
|
. atomic: ?>
|
|
. recursion: ?0 ?R Also: \g<0>
|
|
. define: ?(DEFINE)
|
|
. subroutine: ?1 ?-1 ?&name ?P>name
|
|
conditionalgroup: ?(1)a|b ?(-1)a|b ?(name)a|b
|
|
conditional: ?(?=if)then|else
|
|
mode: ?c-i
|
|
branchreset: ?|
|
|
*/
|
|
|
|
token.clss = token.type = 'group'
|
|
if (str[token.i + 1] !== '?') {
|
|
token.close = null // indicates that it needs a close token.
|
|
token.capture = true
|
|
return token
|
|
}
|
|
|
|
let sub = str.substr(token.i + 2),
|
|
match,
|
|
s = sub[0]
|
|
|
|
if (s === ':') {
|
|
// (?:foo)
|
|
token.type = 'noncapgroup'
|
|
token.close = null
|
|
token.l = 3
|
|
} else if (s === '>') {
|
|
// (?>foo)
|
|
token.type = 'atomic'
|
|
token.close = null
|
|
token.l = 3
|
|
} else if (s === '|') {
|
|
// (?|(a)|(b))
|
|
token.type = 'branchreset'
|
|
token.close = null
|
|
token.l = 3
|
|
token.inGroupNum = token.curGroupNum = this.captureGroups.length
|
|
this.branchResetGroups.push(token)
|
|
} else if (s === '#' && (match = sub.match(/[^)]*\)/))) {
|
|
// (?#foo)
|
|
token.clss = token.type = 'comment'
|
|
token.ignore = true
|
|
token.l = 2 + match[0].length
|
|
} else if (/^(R|0)\)/.test(sub)) {
|
|
// (?R) (?0)
|
|
token.clss = 'ref'
|
|
token.type = 'recursion'
|
|
token.l = 4
|
|
} else if ((match = sub.match(/^P=(\w+)\)/i))) {
|
|
// (?P=name)
|
|
token.type = 'namedref'
|
|
this.getRef(token, match[1])
|
|
token.l = match[0].length + 2
|
|
} else if (/^\(DEFINE\)/.test(sub)) {
|
|
// (?(DEFINE)foo)
|
|
token.type = 'define'
|
|
token.close = null
|
|
token.l = 10
|
|
} else if ((match = sub.match(/^<?[=!]/))) {
|
|
// (?=foo) (?<!foo)
|
|
let isCond = token.prv.type === 'conditional'
|
|
token.clss = isCond ? 'special' : 'lookaround'
|
|
token.close = null
|
|
s = match[0]
|
|
token.behind = s[0] === '<'
|
|
token.negative = s[+token.behind] === '!'
|
|
token.type = isCond
|
|
? 'condition'
|
|
: (token.negative ? 'neg' : 'pos') +
|
|
'look' +
|
|
(token.behind ? 'behind' : 'ahead')
|
|
if (isCond) {
|
|
token.prv.related = [token]
|
|
token.prv.condition = token
|
|
token.related = [token.prv]
|
|
}
|
|
token.l = s.length + 2
|
|
} else if (
|
|
(match = sub.match(/^<(\w+)>/)) ||
|
|
(this._profile.config.namedgroupalt &&
|
|
((match = sub.match(/^'(\w+)'/)) || (match = sub.match(/^P<(\w+)>/))))
|
|
) {
|
|
// (?<name>foo) (?'name'foo) (?P<name>foo)
|
|
token.type = 'namedgroup'
|
|
token.close = null
|
|
token.name = match[1]
|
|
token.capture = true
|
|
token.l = match[0].length + 2
|
|
} else if (
|
|
(match = sub.match(/^([-+]?\d\d?)\)/)) ||
|
|
(match = sub.match(/^(?:&|P>)(\w+)\)/))
|
|
) {
|
|
// (?1) (?-1) (?&name) (?P>name)
|
|
token.type = (isNaN(match[1]) ? 'named' : 'num') + 'subroutine'
|
|
this.getRef(token, match[1])
|
|
token.l = match[0].length + 2
|
|
} else if (
|
|
(match = sub.match(/^\(([-+]?\d\d?)\)/)) ||
|
|
(match = sub.match(/^\((\w+)\)/))
|
|
) {
|
|
// (?(1)a|b) (?(-1)a|b) (?(name)a|b)
|
|
this.getRef(token, match[1])
|
|
token.clss = 'special'
|
|
token.type = 'conditionalgroup'
|
|
token.close = null
|
|
token.l = match[0].length + 2
|
|
} else if (/^\(\?<?[=!]/.test(sub)) {
|
|
// (?(?=if)then|else)
|
|
token.clss = 'special'
|
|
token.type = 'conditional'
|
|
token.close = null
|
|
token.l = 2
|
|
} else if (this.parseMode(token, sub)) {
|
|
// (?i-x)
|
|
// do nothing. handled by parseMode.
|
|
} else {
|
|
// error, found a (? without matching anything. Treat it as a normal group and let it error out.
|
|
token.close = null
|
|
token.capture = true
|
|
}
|
|
|
|
if (!this._profile.tokens[token.type]) {
|
|
token.error = { id: 'notsupported' }
|
|
}
|
|
|
|
return token
|
|
}
|
|
|
|
parseBackSlash(str, token, charset, closeIndex) {
|
|
// Note: Chrome does weird things with \x & \u depending on a number of factors, we ignore this.
|
|
let i = token.i,
|
|
match,
|
|
profile = this._profile
|
|
let sub = str.substr(i + 1),
|
|
c = sub[0],
|
|
val
|
|
if (i + 1 === (closeIndex || str.length)) {
|
|
token.error = { id: 'esccharopen' }
|
|
return
|
|
}
|
|
|
|
if (!charset && (match = sub.match(/^\d\d?/))) {
|
|
// \1 to \99
|
|
// write this as a reference for now, and re-write it later if it doesn't match a group
|
|
token.type = 'numref'
|
|
this.getRef(token, match[0])
|
|
token.l += match[0].length
|
|
return token
|
|
}
|
|
if (profile.tokens.namedref && !charset && (c === 'g' || c === 'k')) {
|
|
return this.parseRef(token, sub)
|
|
}
|
|
|
|
if (
|
|
profile.tokens.unicodecat &&
|
|
(!profile.flags.u || this._modes.u) &&
|
|
(c === 'p' || c === 'P')
|
|
) {
|
|
// unicode: \p{Ll} \pL
|
|
return this.parseUnicode(token, sub)
|
|
} else if (profile.tokens.escsequence && c === 'Q') {
|
|
// escsequence: \Q...\E
|
|
token.type = 'escsequence'
|
|
let e = 2
|
|
if ((i = sub.indexOf('\\E')) !== -1) {
|
|
token.l += i + 2
|
|
e += 2
|
|
} else {
|
|
token.l += closeIndex - token.i - 1
|
|
}
|
|
token.value = str.substr(token.i + 2, token.l - e)
|
|
} else if (
|
|
profile.tokens.escunicodeub &&
|
|
this._modes.u &&
|
|
(match = sub.match(/^u\{(\d+)}/))
|
|
) {
|
|
// unicodeu: \u{0061}
|
|
token.type = 'escunicodeub'
|
|
token.l += match[0].length
|
|
token.code = parseInt(match[1], 16)
|
|
} else if (
|
|
profile.tokens.escunicodeu &&
|
|
(match = sub.match(/^u([\da-fA-F]{4})/))
|
|
) {
|
|
// unicode: \uFFFF
|
|
// update SubstLexer if this changes:
|
|
token.type = 'escunicodeu'
|
|
token.l += match[0].length
|
|
token.code = parseInt(match[1], 16)
|
|
} else if (
|
|
profile.tokens.escunicodexb &&
|
|
(match = sub.match(/^x\{(.*?)}/))
|
|
) {
|
|
// unicode: \x{FFFF}
|
|
token.type = 'escunicodexb'
|
|
token.l += match[0].length
|
|
val = parseInt(match[1], 16)
|
|
// PCRE errors on more than 2 digits (>255). In theory it should allow 4?
|
|
if (isNaN(val) || val > 255 || /[^\da-f]/i.test(match[1])) {
|
|
token.error = { id: 'esccharbad' }
|
|
} else {
|
|
token.code = val
|
|
}
|
|
} else if ((match = sub.match(/^x([\da-fA-F]{0,2})/))) {
|
|
// hex ascii: \xFF
|
|
token.type = 'eschexadecimal'
|
|
token.l += match[0].length
|
|
token.code = parseInt(match[1] || 0, 16)
|
|
} else if ((match = sub.match(/^c([a-zA-Z])?/))) {
|
|
// control char: \cA \cz
|
|
// also handles: \c
|
|
// not supported in JS strings
|
|
token.type = 'esccontrolchar'
|
|
if (match[1]) {
|
|
token.code = match[1].toUpperCase().charCodeAt(0) - 64 // A=65
|
|
token.l += 2
|
|
} else if (profile.config.ctrlcodeerr) {
|
|
token.l++
|
|
token.error = { id: 'esccharbad' }
|
|
} else {
|
|
return this.parseChar(str, token, charset) // this builds the "/" token
|
|
}
|
|
} else if ((match = sub.match(/^[0-7]{1,3}/))) {
|
|
// octal ascii: \011
|
|
token.type = 'escoctal'
|
|
sub = match[0]
|
|
if (parseInt(sub, 8) > 255) {
|
|
sub = sub.substr(0, 2)
|
|
}
|
|
token.l += sub.length
|
|
token.code = parseInt(sub, 8)
|
|
} else if (profile.tokens.escoctalo && (match = sub.match(/^o\{(.*?)}/i))) {
|
|
// \o{377}
|
|
token.type = 'escoctal'
|
|
token.l += match[0].length
|
|
val = parseInt(match[1], 8)
|
|
if (isNaN(val) || val > 255 || /[^0-7]/.test(match[1])) {
|
|
token.error = { id: 'esccharbad' }
|
|
} else {
|
|
token.code = val
|
|
}
|
|
} else {
|
|
// single char
|
|
if ((token.type = profile.escCharTypes[c])) {
|
|
token.l++
|
|
token.clss = ExpressionLexer.ANCHOR_TYPES[token.type]
|
|
? 'anchor'
|
|
: 'charclass'
|
|
return token
|
|
}
|
|
|
|
token.code = profile.escCharCodes[c]
|
|
if (token.code === undefined || token.code === false) {
|
|
// unrecognized.
|
|
return this.parseEscChar(token, c)
|
|
}
|
|
|
|
// update SubstLexer if this changes:
|
|
token.l++
|
|
token.type = 'esc_' + token.code
|
|
}
|
|
token.clss = 'esc'
|
|
return token
|
|
}
|
|
|
|
parseEscChar(token, c) {
|
|
// unrecognized escchar: \u \a \8, etc
|
|
// JS: allowed except if u flag set, Safari still allows \8 \9
|
|
// PCRE: allows \8 \9 but not others // TODO: support?
|
|
let profile = this._profile
|
|
token.l = 2
|
|
if (
|
|
(!profile.badEscChars[c] && profile.tokens.escchar && !this._modes.u) ||
|
|
profile.escChars[c]
|
|
) {
|
|
token.type = 'escchar'
|
|
token.code = c.charCodeAt(0)
|
|
token.clss = 'esc'
|
|
} else {
|
|
token.error = { id: 'esccharbad' }
|
|
}
|
|
}
|
|
|
|
parseRef(token, sub) {
|
|
// namedref: \k<name> \k'name' \k{name} \g{name}
|
|
// namedsubroutine: \g<name> \g'name'
|
|
// numref: \g1 \g+2 \g{2}
|
|
// numsubroutine: \g<-1> \g'1'
|
|
// recursion: \g<0> \g'0'
|
|
let c = sub[0],
|
|
s = '',
|
|
match
|
|
if ((match = sub.match(/^[gk](?:'\w*'|<\w*>|{\w*})/))) {
|
|
s = match[0].substr(2, match[0].length - 3)
|
|
if (c === 'k' && !isNaN(s)) {
|
|
s = ''
|
|
} // TODO: specific error for numeric \k?
|
|
} else if (
|
|
(match = sub.match(/^g(?:({[-+]?\d+}|<[-+]?\d+>|'[-+]?\d+')|([-+]?\d+))/))
|
|
) {
|
|
s =
|
|
match[2] !== undefined
|
|
? match[2]
|
|
: match[1].substr(1, match[1].length - 2)
|
|
}
|
|
let isRef = c === 'k' || !(sub[1] === "'" || sub[1] === '<')
|
|
if (!isRef && s == 0) {
|
|
token.type = 'recursion'
|
|
token.clss = 'ref'
|
|
} else {
|
|
// namedref, extnumref, namedsubroutine, numsubroutine
|
|
token.type =
|
|
(isNaN(s) ? 'named' : (isRef ? 'ext' : '') + 'num') +
|
|
(isRef ? 'ref' : 'subroutine')
|
|
this.getRef(token, s)
|
|
}
|
|
token.l += match ? match[0].length : 1
|
|
}
|
|
|
|
parseUnicode(token, sub) {
|
|
// unicodescript: \p{Cherokee}
|
|
// unicodecat: \p{Ll} \pL
|
|
// not: \P{Ll} \p{^Lu}
|
|
let match = sub.match(/p\{\^?([^}]*)}/i),
|
|
val = match && match[1],
|
|
not = sub[0] === 'P'
|
|
if (!match && (match = sub.match(/[pP]([LMZSNPC])/))) {
|
|
val = match[1]
|
|
} else {
|
|
not = not !== (sub[2] === '^')
|
|
}
|
|
token.l += match ? match[0].length : 1
|
|
token.type = 'unicodecat'
|
|
if (this._profile.unicodeScripts[val]) {
|
|
token.type = 'unicodescript'
|
|
} else if (!this._profile.unicodeCategories[val]) {
|
|
val = null
|
|
}
|
|
if (not) {
|
|
token.type = 'not' + token.type
|
|
}
|
|
if ((!this._profile.config.unicodenegated && sub[2] === '^') || !val) {
|
|
token.error = { id: 'unicodebad' }
|
|
}
|
|
token.value = val
|
|
token.clss = 'charclass'
|
|
return token
|
|
}
|
|
|
|
parseMode(token, sub) {
|
|
// (?i-x)
|
|
// supported modes in PCRE: i-caseinsens, x-freespacing, s-dotall, m-multiline, U-switchlazy, [J-samename]
|
|
let match = sub.match(/^[-a-z]+\)/i)
|
|
if (!match) {
|
|
return
|
|
}
|
|
let supModes = this._profile.modes
|
|
let modes = Utils.copy({}, this._modes),
|
|
bad = false,
|
|
not = false,
|
|
s = match[0],
|
|
c
|
|
token.on = token.off = ''
|
|
|
|
for (let i = 0, l = s.length - 1; i < l; i++) {
|
|
c = s[i]
|
|
if (c === '-') {
|
|
not = true
|
|
continue
|
|
}
|
|
if (!supModes[c]) {
|
|
bad = true
|
|
break
|
|
}
|
|
modes[c] = !not
|
|
|
|
token.on = token.on.replace(c, '')
|
|
if (not) {
|
|
token.off = token.off.replace(c, '')
|
|
token.off += c
|
|
} else {
|
|
token.on += c
|
|
}
|
|
}
|
|
|
|
token.clss = 'special'
|
|
token.type = 'mode'
|
|
token.l = match[0].length + 2
|
|
|
|
if (bad) {
|
|
token.error = { id: 'modebad' }
|
|
token.errmode = c
|
|
} else {
|
|
this._modes = modes
|
|
}
|
|
return token
|
|
}
|
|
|
|
parseQuant(str, token) {
|
|
// quantifier: {0,3} {3} {1,}
|
|
token.type = token.clss = 'quant'
|
|
let i = token.i
|
|
let end = str.indexOf('}', i + 1)
|
|
token.l += end - i
|
|
let arr = str.substring(i + 1, end).split(',')
|
|
token.min = parseInt(arr[0])
|
|
token.max =
|
|
arr[1] === undefined ? token.min : arr[1] === '' ? -1 : parseInt(arr[1])
|
|
if (token.max !== -1 && token.min > token.max) {
|
|
token.error = { id: 'quantrev' }
|
|
}
|
|
return token
|
|
}
|
|
|
|
validateRange(str, end) {
|
|
// char range: [a-z] [\11-\n]
|
|
let next = end,
|
|
token = end.prv,
|
|
prv = token.prv
|
|
if (prv.code === undefined || next.code === undefined) {
|
|
// not a range, rewrite as a char:
|
|
this.parseChar(str, token)
|
|
} else {
|
|
token.clss = 'set'
|
|
if (prv.code > next.code) {
|
|
// this gets added here because parse has already moved to the next token:
|
|
this.errors.push((token.error = { id: 'rangerev' }))
|
|
}
|
|
// preserve as separate tokens, but treat as one in the UI:
|
|
next.proxy = prv.proxy = token
|
|
token.set = [prv, token, next]
|
|
}
|
|
}
|
|
}
|
|
|
|
ExpressionLexer.ANCHOR_TYPES = {
|
|
bof: true,
|
|
eof: true,
|
|
bos: true,
|
|
eos: true,
|
|
abseos: true,
|
|
wordboundary: true,
|
|
notwordboundary: true,
|
|
prevmatchend: true,
|
|
}
|
|
|
|
module.exports = ExpressionLexer
|