rsnext/packages/next/lib/regexr/profile/core.js
JJ Kasper 4d291bd810
Use regex lexer for gathering named groups from has (#23626)
This is a follow-up to https://github.com/vercel/next.js/pull/23588 to update to use a regex lexer to gather the named regex groups instead of attempting to gather them through executing the regex since it can fail to gather the regex groups when they are using specific matching. This also ensures we don't pass the value as a segment when value is defined and it doesn't use a capture group. Additional tests are added to cover these cases and documentation updated to reflect this. 

Closes: https://github.com/vercel/next.js/issues/23415

## Bug

- [x] Related issues linked using `fixes #number`
- [x] Integration tests added

## Documentation / Examples

- [x] Make sure the linting passes
2021-04-13 12:34:51 +00:00

420 lines
11 KiB
JavaScript

/*
RegExr: Learn, Build, & Test RegEx
Copyright (C) 2017 gskinner.com, inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
/*
The core profile essentially defines every feature we support, and is then pared down by other profiles. All values should be y (true).
It also acts in part as pseudo documentation for all of the "type" values.
*/
// Short aliases so the feature tables below stay compact.
const y = true
const n = false

/**
 * Builds a lookup table in which every listed name maps to `y`.
 * Keys are inserted in list order, so enumerating the table follows the list.
 *
 * @param {Iterable<string>} names - the keys to enable
 * @returns {Object<string, boolean>} table of `name -> true`
 */
const enabled = (names) => {
  const table = {}
  for (const name of names) {
    table[name] = y
  }
  return table
}

// Unicode script names.
// from: http://www.pcre.org/original/doc/html/pcrepattern.html
const scriptNames = [
  'Arabic', 'Armenian', 'Avestan',
  'Balinese', 'Bamum', 'Bassa_Vah', 'Batak', 'Bengali', 'Bopomofo', 'Brahmi',
  'Braille', 'Buginese', 'Buhid',
  'Canadian_Aboriginal', 'Carian', 'Caucasian_Albanian', 'Chakma', 'Cham',
  'Cherokee', 'Common', 'Coptic', 'Cuneiform', 'Cypriot', 'Cyrillic',
  'Deseret', 'Devanagari', 'Duployan',
  'Egyptian_Hieroglyphs', 'Elbasan', 'Ethiopic',
  'Georgian', 'Glagolitic', 'Gothic', 'Grantha', 'Greek', 'Gujarati',
  'Gurmukhi',
  'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
  'Imperial_Aramaic', 'Inherited', 'Inscriptional_Pahlavi',
  'Inscriptional_Parthian',
  'Javanese',
  'Kaithi', 'Kannada', 'Katakana', 'Kayah_Li', 'Kharoshthi', 'Khmer',
  'Khojki', 'Khudawadi',
  'Lao', 'Latin', 'Lepcha', 'Limbu', 'Linear_A', 'Linear_B', 'Lisu', 'Lycian',
  'Lydian',
  'Mahajani', 'Malayalam', 'Mandaic', 'Manichaean', 'Meetei_Mayek',
  'Mende_Kikakui', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Modi',
  'Mongolian', 'Mro', 'Myanmar',
  'Nabataean', 'New_Tai_Lue', 'Nko',
  'Ogham', 'Ol_Chiki', 'Old_Italic', 'Old_North_Arabian', 'Old_Permic',
  'Old_Persian', 'Old_South_Arabian', 'Old_Turkic', 'Oriya', 'Osmanya',
  'Pahawh_Hmong', 'Palmyrene', 'Pau_Cin_Hau', 'Phags_Pa', 'Phoenician',
  'Psalter_Pahlavi',
  'Rejang', 'Runic',
  'Samaritan', 'Saurashtra', 'Sharada', 'Shavian', 'Siddham', 'Sinhala',
  'Sora_Sompeng', 'Sundanese', 'Syloti_Nagri', 'Syriac',
  'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tai_Tham', 'Tai_Viet', 'Takri', 'Tamil',
  'Telugu', 'Thaana', 'Thai', 'Tibetan', 'Tifinagh', 'Tirhuta',
  'Ugaritic',
  'Vai',
  'Warang_Citi',
  'Yi',
]

// Unicode general category codes.
// from: http://www.pcre.org/original/doc/html/pcrepattern.html
const categoryNames = [
  'C', // Other
  'Cc', // Control
  'Cf', // Format
  'Cn', // Unassigned
  'Co', // Private use
  'Cs', // Surrogate
  'L', // Letter
  'L&', // Any letter
  'Ll', // Lower case letter
  'Lm', // Modifier letter
  'Lo', // Other letter
  'Lt', // Title case letter
  'Lu', // Upper case letter
  'M', // Mark
  'Mc', // Spacing mark
  'Me', // Enclosing mark
  'Mn', // Non-spacing mark
  'N', // Number
  'Nd', // Decimal number
  'Nl', // Letter number
  'No', // Other number
  'P', // Punctuation
  'Pc', // Connector punctuation
  'Pd', // Dash punctuation
  'Pe', // Close punctuation
  'Pf', // Final punctuation
  'Pi', // Initial punctuation
  'Po', // Other punctuation
  'Ps', // Open punctuation
  'S', // Symbol
  'Sc', // Currency symbol
  'Sk', // Modifier symbol
  'Sm', // Mathematical symbol
  'So', // Other symbol
  'Z', // Separator
  'Zl', // Line separator
  'Zp', // Paragraph separator
  'Zs', // Space separator
]

// POSIX character class names, as used in [[:name:]].
// from: http://www.pcre.org/original/doc/html/pcrepattern.html
const posixClassNames = [
  'alnum', // letters and digits
  'alpha', // letters
  'ascii', // character codes 0 - 127
  'blank', // space or tab only
  'cntrl', // control characters
  'digit', // decimal digits (same as \d)
  'graph', // printing characters, excluding space
  'lower', // lower case letters
  'print', // printing characters, including space
  'punct', // printing characters, excluding letters and digits and space
  'space', // white space (the same as \s from PCRE 8.34)
  'upper', // upper case letters
  'word', // "word" characters (same as \w)
  'xdigit', // hexadecimal digits
]

// Token type names.
// note that not all of these are actively used in the lexer, but are included for completeness.
const tokenNames = [
  'open', // opening /
  'close', // closing /
  'char', // abc

  // classes (more class types live in escCharTypes and charTypes):
  'set', // [a-z]
  'setnot', // [^a-z]
  'setclose', // ]
  'range', // [a-z]
  'unicodecat', // \p{Ll} \P{^Ll} \pL
  'notunicodecat', // \P{Ll} \p{^Ll} \PL
  'unicodescript', // \p{Cherokee} \P{^Cherokee}
  'notunicodescript', // \P{Cherokee} \p{^Cherokee}
  'posixcharclass', // [[:alpha:]]
  // not in supported flavors: "posixcollseq" // [[.foo.]] // this is recognized by the lexer, currently returns "notsupported" error
  // not in supported flavors: "unicodeblock" // \p{InThai} \p{IsThai} and NOT \P
  // not in supported flavors: "subtract" // [base-[subtract]]
  // not in supported flavors: "intersect" // [base&&[intersect]]

  // escapes (more escape types live in escCharCodes and escCharTypes):
  'escoctal', // \11
  'escunicodeu', // \uFFFF
  'escunicodeub', // \u{00A9}
  'escunicodexb', // \x{00A9}
  'escsequence', // \Q...\E
  'eschexadecimal', // \xFF
  'esccontrolchar', // \cA
  'escoctalo', // \o{377} // resolved to escoctal in lexer, no docs required
  'escchar', // \m (unrecognized escapes) // no reference documentation required

  // groups:
  'group', // (foo)
  'groupclose', // )
  'noncapgroup', // (?:foo)
  'namedgroup', // (?P<name>foo) (?<name>foo) (?'name'foo)
  'atomic', // (?>foo|bar)
  'define', // (?(DEFINE)foo)
  'branchreset', // (?|(a)|(b))

  // lookaround:
  'poslookbehind', // (?<=foo)
  'neglookbehind', // (?<!foo)
  'poslookahead', // (?=foo)
  'neglookahead', // (?!foo)

  // references:
  'namedref', // \k<name> \k'name' \k{name} (?P=name) \g{name}
  'numref', // \1
  'extnumref', // \g{-1} \g{+1} \g{1} \g1 \g-1
  'recursion', // (?R) (?0) \g<0> \g'0'
  'numsubroutine', // \g<1> \g'-1' (?1) (?-1)
  'namedsubroutine', // \g<name> \g'name' (?&name) (?P>name)

  // quantifiers (the `?`, `+` and `*` characters are listed in charTypes):
  'quant', // {1,2}
  'possessive', // ++
  'lazy', // ?

  // special:
  'conditional', // (?(?=if)then|else)
  'condition', // (?=if) any lookaround
  'conditionalelse', // |
  'conditionalgroup', // (?(1)a|b) (?(-1)a|b) (?(name)a|b)
  'mode', // (?i-x) see modes
  'comment', // (?#comment)

  // meta:
  'matchanyset', // [\s\S]
]

// Substitution token type names.
// named references aren't supported in JS or PCRE / PHP
const substTokenNames = [
  'subst_$esc', // $$
  'subst_$&match', // $&
  'subst_$before', // $`
  'subst_$after', // $'
  'subst_$group', // $1 $99 // resolved to subst_group in lexer, no docs required
  'subst_$bgroup', // ${1} ${99} // resolved to subst_group in lexer, no docs required
  'subst_bsgroup', // \1 \99 // resolved to subst_group in lexer, no docs required
  'subst_group', // $1 \1 \{1} // combined in docs, not used by lexer
  'subst_0match', // $0 \0 \{0}
  // this isn't a feature of the engine, but of RegExr:
  'subst_esc', // \n \r \u1234
]

/*
The assembled core profile. Property order below is significant only in that
it mirrors the order callers see when enumerating the object.
*/
const core = {
  id: 'core',

  // flag letter -> flag type
  flags: {
    g: 'global', // note that this is not a real flag in some flavors, but a different method call
    i: 'caseinsensitive',
    m: 'multiline',
    s: 'dotall',
    u: 'unicode',
    y: 'sticky',
    x: 'extended',
    U: 'ungreedy',
  },

  // reserved characters that need to be escaped (one key per character):
  escChars: enabled('+*?^$\\.[]{}()|/'.split('')),

  // escape chars that are specifically not supported by the flavor:
  badEscChars: n,

  // escape letter -> character code it stands for
  escCharCodes: {
    0: 0, // null
    a: 7, // bell
    t: 9, // tab
    n: 10, // lf
    v: 11, // vertical tab
    f: 12, // form feed
    r: 13, // cr
    e: 27, // escape
  },

  // escape letter -> token type
  escCharTypes: {
    A: 'bos',
    b: 'wordboundary',
    B: 'notwordboundary',
    d: 'digit',
    D: 'notdigit',
    G: 'prevmatchend',
    h: 'hwhitespace',
    H: 'nothwhitespace',
    K: 'keepout',
    N: 'notlinebreak',
    R: 'linebreak',
    s: 'whitespace',
    S: 'notwhitespace',
    v: 'vwhitespace',
    V: 'notvwhitespace',
    w: 'word',
    W: 'notword',
    X: 'unicodegrapheme',
    Z: 'eos',
    z: 'abseos',
  },

  // unescaped special character -> token type
  charTypes: {
    '.': 'dot',
    '|': 'alt',
    $: 'eof',
    '^': 'bof',
    '?': 'opt', // also: "lazy"
    '+': 'plus', // also: "possessive"
    '*': 'star',
  },

  // token types that may not be followed by a quantifier.
  // all group/set open tokens are unquantifiable by default (ie. tokens with a .close value)
  unquantifiable: enabled([
    'quant',
    'plus',
    'star',
    'opt',
    'lazy',
    'possessive',
    'eof',
    'bof',
    'eos',
    'abseos',
    'alt',
    'open',
    'mode',
    'comment', // TODO: this should actually be ignored by quantifiers.
    'condition',
  ]),

  unicodeScripts: enabled(scriptNames),
  unicodeCategories: enabled(categoryNames),
  posixCharClasses: enabled(posixClassNames),

  // inline mode letter -> mode type, as in (?i-x)
  modes: {
    i: 'caseinsensitive',
    s: 'dotall',
    m: 'multiline',
    x: 'freespacing',
    J: 'samename',
    U: 'switchlazy',
  },

  tokens: enabled(tokenNames),
  substTokens: enabled(substTokenNames),

  // behavioural switches:
  config: enabled([
    'forwardref', // \1(a)
    'nestedref', // (\1a|b)+
    'ctrlcodeerr', // does \c error? (vs decompose)
    'reftooctalalways', // does a single digit reference \1 become an octal? (vs remain an unmatched ref)
    'substdecomposeref', // will a subst reference decompose? (ex. \3 becomes "\" & "3" if < 3 groups)
    'looseesc', // should unrecognized escape sequences match the character (ex. \u could match "u") // disabled when `u` flag is set
    'unicodenegated', // \p{^etc}
    'namedgroupalt', // if false, only support (?<name>foo)
  ]),

  // documentation overrides; empty in the core profile.
  docs: {
    // for example:
    //possessive: {desc: "+This will be appended to the existing entry." },
    //namedgroup: {tip: "This will overwrite the existing entry." }
  },
}
// CommonJS export: the module's value is the `core` profile object itself
// (exported by reference, not copied).
module.exports = core