Kind of touchy, aren't they? For a parser generator that's not suitable for production use (to paraphrase), it certainly has a lot of users expecting it to work. At least that's my impression.
Incidentally, the token definitions for COBOL "letters" look like this:
/*
* The following comprises the set of characters allowed in user-defined words.
* The characters include the letters, ideographic and syllabic characters, digits,
* modifiers, and combining marks recommended for programming language identifiers
* in Annex A of ISO/IEC TR 10176:2003. These characters can be used to write
* many natural languages of the world.
* It also corresponds to the set allowed by the committee draft COBOL standard
* ISO/IEC 1989:20xx. If/when this becomes a standard, this set will be aligned if it
* differs from this draft.
*
* Layout of this production:
* - It is declared for the lexical states COBOL_STATE, FUNCTION_STATE, SQL_STATE
*   and JAVA_STATE.
* - Every definition is a single bracketed character list: one named class per
*   script, plus DIGIT, SPECIAL_LETTERS and ADDITIONAL_CHARS. Entries are either
*   a single code point ("\uXXXX") or an inclusive range ("\uXXXX"-"\uYYYY");
*   a range may be split across two source lines after the '-'.
* - Every name carries a leading '#'. In JavaCC this marks a private regular
*   expression, i.e. a building block for other definitions rather than a token
*   of its own. NOTE(review): confirm this grammar is processed with JavaCC
*   semantics.
* - The classes are combined into LETTER by the production that follows this
*   one; DIGIT is declared here but is not one of LETTER's alternatives.
*/
< COBOL_STATE, FUNCTION_STATE, SQL_STATE, JAVA_STATE > TOKEN :
< #LATIN:
[
// U+0041-U+005A are the basic Latin capitals A-Z, U+0061-U+007A the small letters a-z.
"\u0041"-"\u005A",
"\u0061"-"\u007A",
"\u00AA", "\u00BA", "\u00C0"-"\u00D6", "\u00D8"-"\u00F6", "\u00F8"-"\u01BA", "\u01BB", "\u01BC"-"\u01BF",
"\u01C0"-"\u01C3", "\u01C4"-"\u021F", "\u0222"-"\u0233", "\u0250"-"\u02AD", "\u1E00"-"\u1E9B", "\u1EA0"-"\u1EF9", "\u207F"
]
>
|
< #GREEK:
[
"\u0386", "\u0388"-"\u038A", "\u038C", "\u038E"-"\u03A1", "\u03A3"-"\u03CE", "\u03D0"-"\u03D7", "\u03DA"-"\u03F3", "\u1F00"-"\u1F15", "\u1F18"-
"\u1F1D", "\u1F20"-"\u1F45", "\u1F48"-"\u1F4D", "\u1F50"-"\u1F57", "\u1F59", "\u1F5B", "\u1F5D", "\u1F5F"-"\u1F7D", "\u1F80"-"\u1FB4", "\u1FB6"-"\u1FBC",
"\u1FC2"-"\u1FC4", "\u1FC6"-"\u1FCC", "\u1FD0"-"\u1FD3", "\u1FD6"-"\u1FDB", "\u1FE0"-"\u1FEC", "\u1FF2"-"\u1FF4", "\u1FF6"-"\u1FFC"
]
>
|
< #CYRILLIC:
[
"\u0400"-"\u0481", "\u048C"-"\u04C4", "\u04C7"-"\u04C8", "\u04CB"-"\u04CC", "\u04D0"-"\u04F5", "\u04F8"-"\u04F9"
]
>
|
< #ARMENIAN:
[
"\u0531"-"\u0556", "\u0561"-"\u0587"
]
>
|
< #HEBREW:
[
"\u05B0"-"\u05B9", "\u05BB"-"\u05BD", "\u05BF", "\u05C1"-"\u05C2", "\u05D0"-"\u05EA", "\u05F0"-"\u05F2"
]
>
|
< #ARABIC:
[
"\u0621"-"\u063A", "\u0640", "\u0641"-"\u064A", "\u064B"-"\u0652", "\u0670", "\u0671"-"\u06D3", "\u06D5", "\u06D6"-"\u06DC", "\u06E5"-"\u06E6",
"\u06E7"-"\u06E8", "\u06EA"-"\u06ED", "\u06FA"-"\u06FC"
]
>
|
< #SYRIAC:
[
"\u0710", "\u0711", "\u0712"-"\u072C"
]
>
|
< #THAANA:
[
"\u0780"-"\u07A5", "\u07A6"-"\u07B0"
]
>
|
< #DEVANAGARI:
[
"\u0901"-"\u0902", "\u0903", "\u0905"-"\u0939", "\u093D", "\u093E"-"\u0940", "\u0941"-"\u0948", "\u0949"-"\u094C", "\u094D", "\u0950", "\u0951"-"\u0952",
"\u0958"-"\u0961", "\u0962"-"\u0963"
]
>
|
< #BENGALI:
[
"\u0981", "\u0982"-"\u0983", "\u0985"-"\u098C", "\u098F"-"\u0990", "\u0993"-"\u09A8", "\u09AA"-"\u09B0", "\u09B2", "\u09B6"-"\u09B9", "\u09BE"-"\u09C0",
"\u09C1"-"\u09C4", "\u09C7"-"\u09C8", "\u09CB"-"\u09CC", "\u09CD", "\u09DC"-"\u09DD", "\u09DF"-"\u09E1", "\u09E2"-"\u09E3", "\u09F0"-"\u09F1"
]
>
|
< #GURMUKHI:
[
"\u0A02", "\u0A05"-"\u0A0A", "\u0A0F"-"\u0A10", "\u0A13"-"\u0A28", "\u0A2A"-"\u0A30", "\u0A32"-"\u0A33", "\u0A35"-"\u0A36", "\u0A38"-"\u0A39",
"\u0A3E"-"\u0A40", "\u0A41"-"\u0A42", "\u0A47"-"\u0A48", "\u0A4B"-"\u0A4D", "\u0A59"-"\u0A5C", "\u0A5E", "\u0A72"-"\u0A74"
]
>
|
< #GUJARATI:
[
"\u0A81"-"\u0A82", "\u0A83", "\u0A85"-"\u0A8B", "\u0A8D", "\u0A8F"-"\u0A91", "\u0A93"-"\u0AA8", "\u0AAA"-"\u0AB0", "\u0AB2"-"\u0AB3", "\u0AB5"-
"\u0AB9", "\u0ABD", "\u0ABE"-"\u0AC0", "\u0AC1"-"\u0AC5", "\u0AC7"-"\u0AC8", "\u0AC9", "\u0ACB"-"\u0ACC", "\u0ACD", "\u0AD0", "\u0AE0"
]
>
|
< #ORIYA:
[
"\u0B01", "\u0B02"-"\u0B03", "\u0B05"-"\u0B0C", "\u0B0F"-"\u0B10", "\u0B13"-"\u0B28", "\u0B2A"-"\u0B30", "\u0B32"-"\u0B33", "\u0B36"-"\u0B39",
"\u0B3D", "\u0B3E", "\u0B3F", "\u0B40", "\u0B41"-"\u0B43", "\u0B47"-"\u0B48", "\u0B4B"-"\u0B4C", "\u0B4D", "\u0B5C"-"\u0B5D", "\u0B5F"-"\u0B61"
]
>
|
< #TAMIL:
[
"\u0B82", "\u0B83", "\u0B85"-"\u0B8A", "\u0B8E"-"\u0B90", "\u0B92"-"\u0B95", "\u0B99"-"\u0B9A", "\u0B9C", "\u0B9E"-"\u0B9F", "\u0BA3"-"\u0BA4",
"\u0BA8"-"\u0BAA", "\u0BAE"-"\u0BB5", "\u0BB7"-"\u0BB9", "\u0BBE"-"\u0BBF", "\u0BC0", "\u0BC1"-"\u0BC2", "\u0BC6"-"\u0BC8", "\u0BCA"-"\u0BCC",
"\u0BCD"
]
>
|
< #TELUGU:
[
"\u0C01"-"\u0C03", "\u0C05"-"\u0C0C", "\u0C0E"-"\u0C10", "\u0C12"-"\u0C28", "\u0C2A"-"\u0C33", "\u0C35"-"\u0C39", "\u0C3E"-"\u0C40", "\u0C41"-
"\u0C44", "\u0C46"-"\u0C48", "\u0C4A"-"\u0C4D", "\u0C60"-"\u0C61"
]
>
|
< #KANNADA:
[
"\u0C82"-"\u0C83", "\u0C85"-"\u0C8C", "\u0C8E"-"\u0C90", "\u0C92"-"\u0CA8", "\u0CAA"-"\u0CB3", "\u0CB5"-"\u0CB9", "\u0CBE", "\u0CBF", "\u0CC0"-
"\u0CC4", "\u0CC6", "\u0CC7"-"\u0CC8", "\u0CCA"-"\u0CCB", "\u0CCC"-"\u0CCD", "\u0CDE", "\u0CE0"-"\u0CE1"
]
>
|
< #MALAYALAM:
[
"\u0D02"-"\u0D03", "\u0D05"-"\u0D0C", "\u0D0E"-"\u0D10", "\u0D12"-"\u0D28", "\u0D2A"-"\u0D39", "\u0D3E"-"\u0D40", "\u0D41"-"\u0D43", "\u0D46"-
"\u0D48", "\u0D4A"-"\u0D4C", "\u0D4D", "\u0D60"-"\u0D61"
]
>
|
< #SINHALA:
[
"\u0D82"-"\u0D83", "\u0D85"-"\u0D96", "\u0D9A"-"\u0DB1", "\u0DB3"-"\u0DBB", "\u0DBD", "\u0DC0"-"\u0DC6", "\u0DCA", "\u0DCF"-"\u0DD1",
"\u0DD2"-"\u0DD4", "\u0DD6", "\u0DD8"-"\u0DDF", "\u0DF2"-"\u0DF3"
]
>
|
< #THAI:
[
"\u0E01"-"\u0E30", "\u0E31", "\u0E32"-"\u0E33", "\u0E34"-"\u0E3A", "\u0E40"-"\u0E45", "\u0E46", "\u0E47"-"\u0E4E"
]
>
|
< #LAO:
[
"\u0E81"-"\u0E82", "\u0E84", "\u0E87"-"\u0E88", "\u0E8A", "\u0E8D", "\u0E94"-"\u0E97", "\u0E99"-"\u0E9F", "\u0EA1"-"\u0EA3", "\u0EA5", "\u0EA7",
"\u0EAA"-"\u0EAB", "\u0EAD"-"\u0EB0", "\u0EB1", "\u0EB2"-"\u0EB3", "\u0EB4"-"\u0EB9", "\u0EBB"-"\u0EBC", "\u0EBD", "\u0EC0"-"\u0EC4", "\u0EC6",
"\u0EC8"-"\u0ECD", "\u0EDC"-"\u0EDD"
]
>
|
< #TIBETAN:
[
"\u0F00", "\u0F18"-"\u0F19", "\u0F35", "\u0F37", "\u0F39", "\u0F40"-"\u0F47", "\u0F49"-"\u0F6A", "\u0F71"-"\u0F7E", "\u0F7F", "\u0F80"-"\u0F84", "\u0F86"-
"\u0F87", "\u0F88"-"\u0F8B", "\u0F90"-"\u0F97", "\u0F99"-"\u0FBC"
]
>
|
< #MYANMAR:
[
"\u1000"-"\u1021", "\u1023"-"\u1027", "\u1029"-"\u102A", "\u102C", "\u102D"-"\u1030", "\u1031", "\u1032", "\u1036"-"\u1037", "\u1038", "\u1039", "\u1050"-
"\u1055", "\u1056"-"\u1057", "\u1058"-"\u1059"
]
>
|
< #GEORGIAN:
[
"\u10A0"-"\u10C5", "\u10D0"-"\u10F6"
]
>
|
< #ETHIOPIC:
[
"\u1200"-"\u1206", "\u1208"-"\u1246", "\u1248", "\u124A"-"\u124D", "\u1250"-"\u1256", "\u1258", "\u125A"-"\u125D", "\u1260"-"\u1286", "\u1288", "\u128A"-
"\u128D", "\u1290"-"\u12AE", "\u12B0", "\u12B2"-"\u12B5", "\u12B8"-"\u12BE", "\u12C0", "\u12C2"-"\u12C5", "\u12C8"-"\u12CE", "\u12D0"-"\u12D6",
"\u12D8"-"\u12EE", "\u12F0"-"\u130E", "\u1310", "\u1312"-"\u1315", "\u1318"-"\u131E", "\u1320"-"\u1346", "\u1348"-"\u135A"
]
>
|
< #CHEROKEE:
[
"\u13A0"-"\u13F4"
]
>
|
< #SYLLABICS:
[
"\u1401"-"\u166C", "\u166F"-"\u1676"
]
>
|
< #OGHAM:
[
"\u1681"-"\u169A"
]
>
|
< #RUNIC:
[
"\u16A0"-"\u16EA", "\u16EE"-"\u16F0"
]
>
|
< #KHMER:
[
"\u1780"-"\u17B3", "\u17B4"-"\u17B6", "\u17B7"-"\u17BD", "\u17BE"-"\u17C5", "\u17C6", "\u17C7"-"\u17C8", "\u17C9"-"\u17D3"
]
>
|
< #MONGOLIAN:
[
"\u1820"-"\u1842", "\u1843", "\u1844"-"\u1877", "\u1880"-"\u18A8", "\u18A9"
]
>
|
< #HIRAGANA:
[
"\u3041"-"\u3094"
]
>
|
< #KATAKANA:
[
"\u30A1"-"\u30FA", "\u30FB", "\u30FC"
]
>
|
< #BOPOMOFO:
[
"\u3105"-"\u312C", "\u31A0"-"\u31B7"
]
>
|
< #UNIFIED_IDEOGRAPHS:
[
"\u3400"-"\u4DB5", "\u4E00"-"\u9FA5", "\uFA0E"-"\uFA0F", "\uFA11", "\uFA13"-"\uFA14", "\uFA1F", "\uFA21", "\uFA23"-"\uFA24", "\uFA27"-"\uFA29"
]
>
|
< #YI:
[
"\uA000"-"\uA48C"
]
>
|
< #HANGUL:
[
"\uAC00"-"\uD7A3"
]
>
|
< #DIGIT:
[
// U+0030-U+0039 are the ASCII digits 0-9; the remaining ranges are the digit
// runs of other scripts. Two of them ("\u0BE7"-"\u0BEF" and "\u1369"-"\u1371")
// span nine code points rather than ten.
"\u0030"-"\u0039", "\u0660"-"\u0669", "\u06F0"-"\u06F9", "\u0966"-"\u096F", "\u09E6"-"\u09EF", "\u0A66"-"\u0A6F", "\u0AE6"-"\u0AEF", "\u0B66"-"\u0B6F",
"\u0BE7"-"\u0BEF", "\u0C66"-"\u0C6F", "\u0CE6"-"\u0CEF", "\u0D66"-"\u0D6F", "\u0E50"-"\u0E59", "\u0ED0"-"\u0ED9", "\u0F20"-"\u0F29", "\u1040"-
"\u1049", "\u1369"-"\u1371", "\u17E0"-"\u17E9", "\u1810"-"\u1819"
]
>
|
< #SPECIAL_LETTERS:
[
// Individual code points and short ranges that are not listed under any of
// the script-named classes above.
"\u00B5", "\u02B0"-"\u02B8", "\u02BB"-"\u02C1", "\u02D0"-"\u02D1", "\u02E0"-"\u02E4", "\u02EE", "\u037A", "\u0559", "\u1FBE", "\u203F"-"\u2040",
"\u2102", "\u2107", "\u210A"-"\u2113", "\u2115", "\u2119"-"\u211D", "\u2124", "\u2126", "\u2128", "\u212A"-"\u212D", "\u212F"-"\u2131", "\u2133"-"\u2134",
"\u2135"-"\u2138", "\u2139", "\u2160"-"\u2183", "\u3005", "\u3006", "\u3007", "\u3021"-"\u3029", "\u3038"-"\u303A"
]
>
|
< #ADDITIONAL_CHARS:
[
// Three punctuation-like characters accepted alongside the letter classes.
"$",
"\u005F", // (low line)
"\u00B7" // (middle dot)
]
>
;
/*
* LETTER matches one character belonging to any of the character classes named
* below. The alternatives appear in the same order in which the classes are
* declared. DIGIT is not among them, whereas ADDITIONAL_CHARS ("$", low line,
* middle dot) is.
* The leading '#' is the same marker carried by the classes it references.
*/
< COBOL_STATE, FUNCTION_STATE, SQL_STATE, JAVA_STATE > TOKEN :
< #LETTER:
  <LATIN> | <GREEK> | <CYRILLIC> | <ARMENIAN> | <HEBREW> | <ARABIC> | <SYRIAC> | <THAANA>
| <DEVANAGARI> | <BENGALI> | <GURMUKHI> | <GUJARATI> | <ORIYA> | <TAMIL> | <TELUGU>
| <KANNADA> | <MALAYALAM> | <SINHALA>
| <THAI> | <LAO> | <TIBETAN> | <MYANMAR> | <GEORGIAN> | <ETHIOPIC> | <CHEROKEE>
| <SYLLABICS> | <OGHAM> | <RUNIC> | <KHMER> | <MONGOLIAN>
| <HIRAGANA> | <KATAKANA> | <BOPOMOFO> | <UNIFIED_IDEOGRAPHS> | <YI> | <HANGUL>
| <SPECIAL_LETTERS>
| <ADDITIONAL_CHARS>
>
;