/* RegExr: Learn, Build, & Test RegEx Copyright (C) 2017 gskinner.com, inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ /* The core profile essentially defines every feature we support, and is then pared down by other profiles. All values should be y (true). It also acts in part as pseudo documentation for all of the "type" values. */ let y = true, n = false let core = { id: 'core', flags: { g: 'global', // note that this is not a real flag in some flavors, but a different method call i: 'caseinsensitive', m: 'multiline', s: 'dotall', u: 'unicode', y: 'sticky', x: 'extended', U: 'ungreedy', }, // reserved characters that need to be escaped: escChars: '+*?^$\\.[]{}()|/'.split('').reduce((o, c) => { o[c] = y return o }, {}), // escape chars that are specifically not supported by the flavor: badEscChars: n, escCharCodes: { '0': 0, // null a: 7, // bell t: 9, // tab n: 10, // lf v: 11, // vertical tab f: 12, // form feed r: 13, // cr e: 27, // escape }, escCharTypes: { A: 'bos', b: 'wordboundary', B: 'notwordboundary', d: 'digit', D: 'notdigit', G: 'prevmatchend', h: 'hwhitespace', H: 'nothwhitespace', K: 'keepout', N: 'notlinebreak', R: 'linebreak', s: 'whitespace', S: 'notwhitespace', v: 'vwhitespace', V: 'notvwhitespace', w: 'word', W: 'notword', X: 'unicodegrapheme', Z: 'eos', z: 'abseos', }, charTypes: { '.': 'dot', '|': 'alt', $: 'eof', '^': 'bof', '?': 'opt', // also: "lazy" '+': 'plus', // also: "possessive" '*': 'star', }, unquantifiable: { // all group/set open tokens are unquantifiable by default (ie. tokens with a .close value) quant: y, plus: y, star: y, opt: y, lazy: y, possessive: y, eof: y, bof: y, eos: y, abseos: y, alt: y, open: y, mode: y, comment: y, // TODO: this should actually be ignored by quantifiers. condition: y, }, unicodeScripts: { // from: http://www.pcre.org/original/doc/html/pcrepattern.html Arabic: y, Armenian: y, Avestan: y, Balinese: y, Bamum: y, Bassa_Vah: y, Batak: y, Bengali: y, Bopomofo: y, Brahmi: y, Braille: y, Buginese: y, Buhid: y, Canadian_Aboriginal: y, Carian: y, Caucasian_Albanian: y, Chakma: y, Cham: y, Cherokee: y, Common: y, Coptic: y, Cuneiform: y, Cypriot: y, Cyrillic: y, Deseret: y, Devanagari: y, Duployan: y, Egyptian_Hieroglyphs: y, Elbasan: y, Ethiopic: y, Georgian: y, Glagolitic: y, Gothic: y, Grantha: y, Greek: y, Gujarati: y, Gurmukhi: y, Han: y, Hangul: y, Hanunoo: y, Hebrew: y, Hiragana: y, Imperial_Aramaic: y, Inherited: y, Inscriptional_Pahlavi: y, Inscriptional_Parthian: y, Javanese: y, Kaithi: y, Kannada: y, Katakana: y, Kayah_Li: y, Kharoshthi: y, Khmer: y, Khojki: y, Khudawadi: y, Lao: y, Latin: y, Lepcha: y, Limbu: y, Linear_A: y, Linear_B: y, Lisu: y, Lycian: y, Lydian: y, Mahajani: y, Malayalam: y, Mandaic: y, Manichaean: y, Meetei_Mayek: y, Mende_Kikakui: y, Meroitic_Cursive: y, Meroitic_Hieroglyphs: y, Miao: y, Modi: y, Mongolian: y, Mro: y, Myanmar: y, Nabataean: y, New_Tai_Lue: y, Nko: y, Ogham: y, Ol_Chiki: y, Old_Italic: y, Old_North_Arabian: y, Old_Permic: y, Old_Persian: y, Old_South_Arabian: y, Old_Turkic: y, Oriya: y, Osmanya: y, Pahawh_Hmong: y, Palmyrene: y, Pau_Cin_Hau: y, Phags_Pa: y, Phoenician: y, Psalter_Pahlavi: y, Rejang: y, Runic: y, Samaritan: y, Saurashtra: y, Sharada: y, Shavian: y, Siddham: y, Sinhala: y, Sora_Sompeng: y, Sundanese: y, Syloti_Nagri: y, Syriac: y, Tagalog: y, Tagbanwa: y, Tai_Le: y, Tai_Tham: y, Tai_Viet: y, Takri: y, Tamil: y, Telugu: y, Thaana: y, Thai: y, Tibetan: y, Tifinagh: y, Tirhuta: y, Ugaritic: y, Vai: y, Warang_Citi: y, Yi: y, }, unicodeCategories: { // from: http://www.pcre.org/original/doc/html/pcrepattern.html C: y, // Other Cc: y, // Control Cf: y, // Format Cn: y, // Unassigned Co: y, // Private use Cs: y, // Surrogate L: y, // Letter 'L&': y, // Any letter Ll: y, // Lower case letter Lm: y, // Modifier letter Lo: y, // Other letter Lt: y, // Title case letter Lu: y, // Upper case letter M: y, // Mark Mc: y, // Spacing mark Me: y, // Enclosing mark Mn: y, // Non-spacing mark N: y, // Number Nd: y, // Decimal number Nl: y, // Letter number No: y, // Other number P: y, // Punctuation Pc: y, // Connector punctuation Pd: y, // Dash punctuation Pe: y, // Close punctuation Pf: y, // Final punctuation Pi: y, // Initial punctuation Po: y, // Other punctuation Ps: y, // Open punctuation S: y, // Symbol Sc: y, // Currency symbol Sk: y, // Modifier symbol Sm: y, // Mathematical symbol So: y, // Other symbol Z: y, // Separator Zl: y, // Line separator Zp: y, // Paragraph separator Zs: y, // Space separator }, posixCharClasses: { // from: http://www.pcre.org/original/doc/html/pcrepattern.html alnum: y, // letters and digits alpha: y, // letters ascii: y, // character codes 0 - 127 blank: y, // space or tab only cntrl: y, // control characters digit: y, // decimal digits (same as \d) graph: y, // printing characters, excluding space lower: y, // lower case letters print: y, // printing characters, including space punct: y, // printing characters, excluding letters and digits and space space: y, // white space (the same as \s from PCRE 8.34) upper: y, // upper case letters word: y, // "word" characters (same as \w) xdigit: y, // hexadecimal digits }, modes: { i: 'caseinsensitive', s: 'dotall', m: 'multiline', x: 'freespacing', J: 'samename', U: 'switchlazy', }, tokens: { // note that not all of these are actively used in the lexer, but are included for completeness. open: y, // opening / close: y, // closing / char: y, // abc // classes: // also in escCharTypes and charTypes set: y, // [a-z] setnot: y, // [^a-z] setclose: y, // ] range: y, // [a-z] unicodecat: y, // \p{Ll} \P{^Ll} \pL notunicodecat: y, // \P{Ll} \p{^Ll} \PL unicodescript: y, // \p{Cherokee} \P{^Cherokee} notunicodescript: y, // \P{Cherokee} \p{^Cherokee} posixcharclass: y, // [[:alpha:]] // not in supported flavors: "posixcollseq": y, // [[.foo.]] // this is recognized by the lexer, currently returns "notsupported" error // not in supported flavors: "unicodeblock": y, // \p{InThai} \p{IsThai} and NOT \P // not in supported flavors: "subtract": y, // [base-[subtract]] // not in supported flavors: "intersect": y, // [base&&[intersect]] // esc: // also in escCharCodes and escCharTypes escoctal: y, // \11 escunicodeu: y, // \uFFFF escunicodeub: y, // \u{00A9} escunicodexb: y, // \x{00A9} escsequence: y, // \Q...\E eschexadecimal: y, // \xFF esccontrolchar: y, // \cA escoctalo: y, // \o{377} // resolved to escoctal in lexer, no docs required escchar: y, // \m (unrecognized escapes) // no reference documentation required // group: group: y, // (foo) groupclose: y, // ) noncapgroup: y, // (?:foo) namedgroup: y, // (?Pfoo) (?foo) (?'name'foo) atomic: y, // (?>foo|bar) define: y, // (?(DEFINE)foo) branchreset: y, // (?|(a)|(b)) // lookaround: poslookbehind: y, // (?<=foo) neglookbehind: y, // (? \k'name' \k{name} (?P=name) \g{name} numref: y, // \1 extnumref: y, // \g{-1} \g{+1} \g{1} \g1 \g-1 recursion: y, // (?R) (?0) \g<0> \g'0' numsubroutine: y, // \g<1> \g'-1' (?1) (?-1) namedsubroutine: y, // \g \g'name' (?&name) (?P>name) // quantifiers: // also in specialChars quant: y, // {1,2} possessive: y, // ++ lazy: y, // ? // special: conditional: y, // (?(?=if)then|else) condition: y, // (?=if) any lookaround conditionalelse: y, // | conditionalgroup: y, // (?(1)a|b) (?(-1)a|b) (?(name)a|b) mode: y, // (?i-x) see modes above comment: y, // (?#comment) // meta: matchanyset: y, // [\s\S] }, substTokens: { // named references aren't supported in JS or PCRE / PHP subst_$esc: y, // $$ 'subst_$&match': y, // $& subst_$before: y, // $` subst_$after: y, // $' subst_$group: y, // $1 $99 // resolved to subst_group in lexer, no docs required subst_$bgroup: y, // ${1} ${99} // resolved to subst_group in lexer, no docs required subst_bsgroup: y, // \1 \99 // resolved to subst_group in lexer, no docs required subst_group: y, // $1 \1 \{1} // combined in docs, not used by lexer subst_0match: y, // $0 \0 \{0} // this isn't a feature of the engine, but of RegExr: subst_esc: y, // \n \r \u1234 }, config: { forwardref: y, // \1(a) nestedref: y, // (\1a|b)+ ctrlcodeerr: y, // does \c error? (vs decompose) reftooctalalways: y, // does a single digit reference \1 become an octal? (vs remain an unmatched ref) substdecomposeref: y, // will a subst reference decompose? (ex. \3 becomes "\" & "3" if < 3 groups) looseesc: y, // should unrecognized escape sequences match the character (ex. \u could match "u") // disabled when `u` flag is set unicodenegated: y, // \p{^etc}" namedgroupalt: y, // if false, only support (?foo) }, docs: { // for example: //possessive: {desc: "+This will be appended to the existing entry." }, //namedgroup: {tip: "This will overwrite the existing entry." } }, } module.exports = core