// rsnext/packages/next/lib/regexr/profile/core.js

/*
RegExr: Learn, Build, & Test RegEx
Copyright (C) 2017  gskinner.com, inc.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

/*
The core profile essentially defines every feature we support, and is then pared down by other profiles. All feature switches should be y (true); the one exception is badEscChars, which is n (false) here.

It also acts in part as pseudo documentation for all of the "type" values.
 */
// Shorthand for the boolean feature switches used throughout the profile.
// Neither binding is ever reassigned, so each gets its own `const`.
const y = true
const n = false

/**
 * The "core" profile: one object listing every flag, escape, character
 * type, token, substitution token and config switch known to the profile
 * system. Per the file header, other profiles pare this set down.
 *
 * Top-level keys:
 *  - id:                profile identifier ('core')
 *  - flags:             flag letter -> flag type name
 *  - escChars:          reserved character -> y
 *  - badEscChars:       n in this profile
 *  - escCharCodes:      escape letter -> character code
 *  - escCharTypes:      escape letter -> token type name
 *  - charTypes:         literal character -> token type name
 *  - unquantifiable:    token type name -> y
 *  - unicodeScripts / unicodeCategories / posixCharClasses: name -> y
 *  - modes:             mode letter -> mode type name
 *  - tokens / substTokens: token type name -> y
 *  - config:            behaviour switch -> y
 *  - docs:              empty here (see the commented examples inside)
 *
 * Key order is kept exactly as written, since consumers may iterate it.
 */
const core = {
  id: 'core',

  // flag letter -> flag type name
  flags: {
    g: 'global', // note that this is not a real flag in some flavors, but a different method call
    i: 'caseinsensitive',
    m: 'multiline',
    s: 'dotall',
    u: 'unicode',
    y: 'sticky',
    x: 'extended',
    U: 'ungreedy',
  },

  // reserved characters that need to be escaped:
  // built from the literal string so that each character becomes a key set to y.
  escChars: '+*?^$\\.[]{}()|/'.split('').reduce((map, ch) => {
    map[ch] = y
    return map
  }, {}),

  // escape chars that are specifically not supported by the flavor:
  badEscChars: n,

  // escape letter -> character code it resolves to
  escCharCodes: {
    '0': 0, // null
    a: 7, // bell
    t: 9, // tab
    n: 10, // lf
    v: 11, // vertical tab
    f: 12, // form feed
    r: 13, // cr
    e: 27, // escape
  },

  // escape letter -> token type name
  escCharTypes: {
    A: 'bos',
    b: 'wordboundary',
    B: 'notwordboundary',
    d: 'digit',
    D: 'notdigit',
    G: 'prevmatchend',
    h: 'hwhitespace',
    H: 'nothwhitespace',
    K: 'keepout',
    N: 'notlinebreak',
    R: 'linebreak',
    s: 'whitespace',
    S: 'notwhitespace',
    v: 'vwhitespace',
    V: 'notvwhitespace',
    w: 'word',
    W: 'notword',
    X: 'unicodegrapheme',
    Z: 'eos',
    z: 'abseos',
  },

  // unescaped character -> token type name
  charTypes: {
    '.': 'dot',
    '|': 'alt',
    $: 'eof',
    '^': 'bof',
    '?': 'opt', // also: "lazy"
    '+': 'plus', // also: "possessive"
    '*': 'star',
  },

  // token types flagged as unquantifiable
  unquantifiable: {
    // all group/set open tokens are unquantifiable by default (ie. tokens with a .close value)
    quant: y,
    plus: y,
    star: y,
    opt: y,
    lazy: y,
    possessive: y,
    eof: y,
    bof: y,
    eos: y,
    abseos: y,
    alt: y,
    open: y,
    mode: y,
    comment: y, // TODO: this should actually be ignored by quantifiers.
    condition: y,
  },

  // Unicode script names
  unicodeScripts: {
    // from: http://www.pcre.org/original/doc/html/pcrepattern.html
    Arabic: y,
    Armenian: y,
    Avestan: y,
    Balinese: y,
    Bamum: y,
    Bassa_Vah: y,
    Batak: y,
    Bengali: y,
    Bopomofo: y,
    Brahmi: y,
    Braille: y,
    Buginese: y,
    Buhid: y,
    Canadian_Aboriginal: y,
    Carian: y,
    Caucasian_Albanian: y,
    Chakma: y,
    Cham: y,
    Cherokee: y,
    Common: y,
    Coptic: y,
    Cuneiform: y,
    Cypriot: y,
    Cyrillic: y,
    Deseret: y,
    Devanagari: y,
    Duployan: y,
    Egyptian_Hieroglyphs: y,
    Elbasan: y,
    Ethiopic: y,
    Georgian: y,
    Glagolitic: y,
    Gothic: y,
    Grantha: y,
    Greek: y,
    Gujarati: y,
    Gurmukhi: y,
    Han: y,
    Hangul: y,
    Hanunoo: y,
    Hebrew: y,
    Hiragana: y,
    Imperial_Aramaic: y,
    Inherited: y,
    Inscriptional_Pahlavi: y,
    Inscriptional_Parthian: y,
    Javanese: y,
    Kaithi: y,
    Kannada: y,
    Katakana: y,
    Kayah_Li: y,
    Kharoshthi: y,
    Khmer: y,
    Khojki: y,
    Khudawadi: y,
    Lao: y,
    Latin: y,
    Lepcha: y,
    Limbu: y,
    Linear_A: y,
    Linear_B: y,
    Lisu: y,
    Lycian: y,
    Lydian: y,
    Mahajani: y,
    Malayalam: y,
    Mandaic: y,
    Manichaean: y,
    Meetei_Mayek: y,
    Mende_Kikakui: y,
    Meroitic_Cursive: y,
    Meroitic_Hieroglyphs: y,
    Miao: y,
    Modi: y,
    Mongolian: y,
    Mro: y,
    Myanmar: y,
    Nabataean: y,
    New_Tai_Lue: y,
    Nko: y,
    Ogham: y,
    Ol_Chiki: y,
    Old_Italic: y,
    Old_North_Arabian: y,
    Old_Permic: y,
    Old_Persian: y,
    Old_South_Arabian: y,
    Old_Turkic: y,
    Oriya: y,
    Osmanya: y,
    Pahawh_Hmong: y,
    Palmyrene: y,
    Pau_Cin_Hau: y,
    Phags_Pa: y,
    Phoenician: y,
    Psalter_Pahlavi: y,
    Rejang: y,
    Runic: y,
    Samaritan: y,
    Saurashtra: y,
    Sharada: y,
    Shavian: y,
    Siddham: y,
    Sinhala: y,
    Sora_Sompeng: y,
    Sundanese: y,
    Syloti_Nagri: y,
    Syriac: y,
    Tagalog: y,
    Tagbanwa: y,
    Tai_Le: y,
    Tai_Tham: y,
    Tai_Viet: y,
    Takri: y,
    Tamil: y,
    Telugu: y,
    Thaana: y,
    Thai: y,
    Tibetan: y,
    Tifinagh: y,
    Tirhuta: y,
    Ugaritic: y,
    Vai: y,
    Warang_Citi: y,
    Yi: y,
  },

  // Unicode general category codes
  unicodeCategories: {
    // from: http://www.pcre.org/original/doc/html/pcrepattern.html
    C: y, // Other
    Cc: y, // Control
    Cf: y, // Format
    Cn: y, // Unassigned
    Co: y, // Private use
    Cs: y, // Surrogate
    L: y, // Letter
    'L&': y, // Any letter
    Ll: y, // Lower case letter
    Lm: y, // Modifier letter
    Lo: y, // Other letter
    Lt: y, // Title case letter
    Lu: y, // Upper case letter
    M: y, // Mark
    Mc: y, // Spacing mark
    Me: y, // Enclosing mark
    Mn: y, // Non-spacing mark
    N: y, // Number
    Nd: y, // Decimal number
    Nl: y, // Letter number
    No: y, // Other number
    P: y, // Punctuation
    Pc: y, // Connector punctuation
    Pd: y, // Dash punctuation
    Pe: y, // Close punctuation
    Pf: y, // Final punctuation
    Pi: y, // Initial punctuation
    Po: y, // Other punctuation
    Ps: y, // Open punctuation
    S: y, // Symbol
    Sc: y, // Currency symbol
    Sk: y, // Modifier symbol
    Sm: y, // Mathematical symbol
    So: y, // Other symbol
    Z: y, // Separator
    Zl: y, // Line separator
    Zp: y, // Paragraph separator
    Zs: y, // Space separator
  },

  // POSIX character class names, as used in [[:name:]]
  posixCharClasses: {
    // from: http://www.pcre.org/original/doc/html/pcrepattern.html
    alnum: y, // letters and digits
    alpha: y, // letters
    ascii: y, // character codes 0 - 127
    blank: y, // space or tab only
    cntrl: y, // control characters
    digit: y, // decimal digits (same as \d)
    graph: y, // printing characters, excluding space
    lower: y, // lower case letters
    print: y, // printing characters, including space
    punct: y, // printing characters, excluding letters and digits and space
    space: y, // white space (the same as \s from PCRE 8.34)
    upper: y, // upper case letters
    word: y, // "word" characters (same as \w)
    xdigit: y, // hexadecimal digits
  },

  // inline mode letter -> mode type name (see the `mode` token below)
  modes: {
    i: 'caseinsensitive',
    s: 'dotall',
    m: 'multiline',
    x: 'freespacing',
    J: 'samename',
    U: 'switchlazy',
  },

  tokens: {
    // note that not all of these are actively used in the lexer, but are included for completeness.
    open: y, // opening /
    close: y, // closing /
    char: y, // abc

    // classes:
    // also in escCharTypes and charTypes
    set: y, // [a-z]
    setnot: y, // [^a-z]
    setclose: y, // ]
    range: y, // [a-z]
    unicodecat: y, // \p{Ll} \P{^Ll} \pL
    notunicodecat: y, // \P{Ll} \p{^Ll} \PL
    unicodescript: y, // \p{Cherokee} \P{^Cherokee}
    notunicodescript: y, // \P{Cherokee} \p{^Cherokee}
    posixcharclass: y, // [[:alpha:]]
    // not in supported flavors:	"posixcollseq": y, // [[.foo.]] // this is recognized by the lexer, currently returns "notsupported" error
    // not in supported flavors:	"unicodeblock": y, // \p{InThai} \p{IsThai} and NOT \P
    // not in supported flavors:	"subtract": y, // [base-[subtract]]
    // not in supported flavors:	"intersect": y, // [base&&[intersect]]

    // esc:
    // also in escCharCodes and escCharTypes
    escoctal: y, // \11
    escunicodeu: y, // \uFFFF
    escunicodeub: y, // \u{00A9}
    escunicodexb: y, // \x{00A9}
    escsequence: y, // \Q...\E
    eschexadecimal: y, // \xFF
    esccontrolchar: y, // \cA
    escoctalo: y, // \o{377} // resolved to escoctal in lexer, no docs required
    escchar: y, // \m (unrecognized escapes) // no reference documentation required

    // group:
    group: y, // (foo)
    groupclose: y, // )
    noncapgroup: y, // (?:foo)
    namedgroup: y, // (?P<name>foo) (?<name>foo) (?'name'foo)
    atomic: y, // (?>foo|bar)
    define: y, // (?(DEFINE)foo)
    branchreset: y, // (?|(a)|(b))

    // lookaround:
    poslookbehind: y, // (?<=foo)
    neglookbehind: y, // (?<!foo)
    poslookahead: y, // (?=foo)
    neglookahead: y, // (?!foo)

    // ref:
    namedref: y, // \k<name> \k'name' \k{name} (?P=name)  \g{name}
    numref: y, // \1
    extnumref: y, // \g{-1} \g{+1} \g{1} \g1 \g-1
    recursion: y, // (?R) (?0) \g<0> \g'0'
    numsubroutine: y, // \g<1> \g'-1' (?1) (?-1)
    namedsubroutine: y, // \g<name> \g'name' (?&name) (?P>name)

    // quantifiers:
    // also see charTypes, whose '?' and '+' entries double as "lazy" / "possessive"
    quant: y, // {1,2}
    possessive: y, // ++
    lazy: y, // ?

    // special:
    conditional: y, // (?(?=if)then|else)
    condition: y, // (?=if) any lookaround
    conditionalelse: y, // |
    conditionalgroup: y, // (?(1)a|b) (?(-1)a|b) (?(name)a|b)
    mode: y, // (?i-x) see modes above
    comment: y, // (?#comment)

    // meta:
    matchanyset: y, // [\s\S]
  },

  substTokens: {
    // named references aren't supported in JS or PCRE / PHP
    subst_$esc: y, // $$
    'subst_$&match': y, // $&
    subst_$before: y, // $`
    subst_$after: y, // $'
    subst_$group: y, // $1 $99 // resolved to subst_group in lexer, no docs required
    subst_$bgroup: y, // ${1} ${99} // resolved to subst_group in lexer, no docs required
    subst_bsgroup: y, // \1 \99 // resolved to subst_group in lexer, no docs required
    subst_group: y, // $1 \1 \{1} // combined in docs, not used by lexer
    subst_0match: y, // $0 \0 \{0}

    // this isn't a feature of the engine, but of RegExr:
    subst_esc: y, // \n \r \u1234
  },

  // behaviour switches
  config: {
    forwardref: y, // \1(a)
    nestedref: y, // (\1a|b)+
    ctrlcodeerr: y, // does \c error? (vs decompose)
    reftooctalalways: y, // does a single digit reference \1 become an octal? (vs remain an unmatched ref)
    substdecomposeref: y, // will a subst reference decompose? (ex. \3 becomes "\" & "3" if < 3 groups)
    looseesc: y, // should unrecognized escape sequences match the character (ex. \u could match "u") // disabled when `u` flag is set
    unicodenegated: y, // \p{^etc}
    namedgroupalt: y, // if false, only support (?<name>foo)
  },

  // intentionally empty in this profile; the commented lines show the entry shapes
  docs: {
    // for example:
    //possessive: {desc: "+This will be appended to the existing entry." },
    //namedgroup: {tip: "This will overwrite the existing entry." }
  },
}

// CommonJS export: the `core` profile object is this module's sole export.
module.exports = core