// This file is part of Visual D
//
// Visual D integrates the D programming language into Visual Studio
// Copyright (c) 2010-2011 by Rainer Schuetze, All Rights Reserved
//
// Distributed under the Boost Software License, Version 1.0.
// See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt

module vdc.lexer;

import std.ascii;
import std.uni : isAlpha;
import std.utf;
import std.conv;

enum supportUnorderedCompareOps = false;

// current limitations:
// - nested comments must not nest more than 255 times
// - braces must not nest more than 255 times inside token string
//   (the token string level occupies 8 bits of the scan state)
// - number of different delimiters must not exceed 256

/// Coarse token category returned by Lexer.scan.
enum TokenCat : int
{
	// assumed to match beginning of visuald.colorizer.TokenColor
	Text,
	Keyword,
	Comment,
	Identifier,
	String,
	Literal,
	Text2,
	Operator,
}

/// One token of a scanned line: category, TOK_* id and the [StartIndex, EndIndex)
/// code unit range inside the scanned text.
struct TokenInfo
{
	TokenCat type;
	int tokid;
	int StartIndex;
	int EndIndex;
}

///////////////////////////////////////////////////////////////////////////////
/// Restartable scanner: all information needed to continue scanning after the
/// end of a piece of text is packed into a single int (see toState).
struct Lexer
{
	enum State
	{
		kWhite,
		kBlockComment,
		kNestedComment,
		kStringCStyle,
		kStringWysiwyg,
		kStringAltWysiwyg,
		kStringDelimited,
		kStringDelimitedNestedBracket,
		kStringDelimitedNestedParen,
		kStringDelimitedNestedBrace,
		kStringDelimitedNestedAngle,
		kStringTokenFirst,  // after 'q', but before '{' to pass '{' as single operator
		kStringToken,  // encoded by tokenStringLevel > 0
		kStringHex,    // for now, treated as State.kStringWysiwyg
		kStringEscape, // removed in D2.026, not supported
	}

	// lexer scan state is: ___TTNNS
	// TT: token string nesting level
	// NN: comment nesting level/string delimiter id
	// S: State
	static State scanState(int state) { return cast(State) (state & 0xf); }
	static int nestingLevel(int state) { return (state >> 4) & 0xff; } // used for state kNestedComment and kStringDelimited
	static int tokenStringLevel(int state) { return (state >> 12) & 0xff; }
	static int getOtherState(int state) { return (state & 0xfff00000); }

	bool mTokenizeTokenString = true;
	bool mSplitNestedComments = true;
	bool mAllowDollarInIdentifiers = false;

	/// Packs the scanner state into an int using the layout read back by
	/// scanState, nestingLevel, tokenStringLevel and getOtherState.
	/// otherState is or'ed in unchanged (getOtherState extracts bits 20..31).
	static int toState(State s, int nesting, int tokLevel, int otherState)
	{
		static assert(State.kStringToken <= 15);
		assert(s >= State.kWhite && s <= State.kStringToken);
		// both fields are 8 bits wide; nesting also carries delimiter ids,
		// which getDelimiterIndex hands out in the range 0..255
		assert(nesting >= 0 && nesting <= 0xff);
		assert(tokLevel >= 0 && tokLevel <= 0xff);

		return s | ((nesting & 0xff) << 4) | ((tokLevel & 0xff) << 12) | otherState;
	}

	/// True for every state from kStringCStyle upwards.
	static bool isStringState(State state) { return state >= State.kStringCStyle; }
	/// True for the block comment and the nested comment state.
	static bool isCommentState(State state) { return state == State.kBlockComment || state == State.kNestedComment; }

	// table of the delimiters of delimited strings, used as a ring of 256 slots
	static string[256] s_delimiters;
	static int s_nextDelimiter;

	/// Returns the slot of delim in s_delimiters. The table is searched backwards
	/// starting at the most recently stored entry; if delim is not found, it is
	/// stored in slot s_nextDelimiter (replacing what was there) and
	/// s_nextDelimiter advances, wrapping at 256.
	static int getDelimiterIndex(string delim)
	{
		int idx = (s_nextDelimiter - 1) & 0xff;
		for( ; idx != s_nextDelimiter; idx = (idx - 1) & 0xff)
			if(delim == s_delimiters[idx])
				return idx;

		// idx == s_nextDelimiter here
		s_nextDelimiter = (s_nextDelimiter + 1) & 0xff;
		s_delimiters[idx] = delim;
		return idx;
	}

	/// Convenience overload of scanIdentifier that discards the token id.
	int scanIdentifier(S)(S text, size_t startpos, ref size_t pos)
	{
		int pid;
		return scanIdentifier(text, startpos, pos, pid);
	}

	/// Advances pos over identifier characters and digits, then classifies
	/// text[startpos .. pos]: keywords yield TokenCat.Keyword (TokenCat.Operator
	/// for "is"), special tokens yield TokenCat.String, anything else is an
	/// identifier. pid receives the TOK_* id.
	int scanIdentifier(S)(S text, size_t startpos, ref size_t pos, ref int pid)
	{
		while(pos < text.length)
		{
			auto nextpos = pos;
			dchar ch = decode(text, nextpos);
			if(!isIdentifierCharOrDigit(ch))
				break;
			pos = nextpos;
		}
		string ident = toUTF8(text[startpos .. pos]);

		if(findKeyword(ident, pid))
			return pid == TOK_is ? TokenCat.Operator : TokenCat.Keyword;
		if(findSpecial(ident, pid))
			return TokenCat.String;

		pid = TOK_Identifier;
		return TokenCat.Identifier;
	}

	/// Matches an operator at startpos via parseOperator. On success pid is set,
	/// pos is moved behind the operator and TokenCat.Operator is returned;
	/// otherwise pid and pos are left alone and TokenCat.Text is returned.
	static int scanOperator(S)(S text, size_t startpos, ref size_t pos, ref int pid)
	{
		size_t len;
		int id = parseOperator(text, startpos, len);
		if(id == TOK_error)
			return TokenCat.Text;

		pid = id;
		pos = startpos + len;
		return TokenCat.Operator;
	}

	/// Decodes the character at pos and advances pos; returns 0 without moving
	/// pos if pos is at or behind the end of text.
	static dchar trydecode(S)(S text, ref size_t pos)
	{
		if(pos >= text.length)
			return 0;
		dchar ch = decode(text, pos);
		return ch;
	}

	/// Advances pos over '_' and over digits valid for base (digits '0' up to
	/// '0' + base - 1 for base < 16, hex digits for base 16).
	static void skipDigits(S)(S text, ref size_t pos, int base)
	{
		while(pos < text.length)
		{
			auto nextpos = pos;
			dchar ch = decode(text, nextpos);
			if(ch != '_')
			{
				if(base < 16 && (ch < '0' || ch >= '0' + base))
					break;
				else if(base == 16 && !isHexDigit(ch))
					break;
			}
			pos = nextpos;
		}
	}

	/// Convenience overload of scanNumber that discards the token id.
	static int scanNumber(S)(S text, dchar ch, ref size_t pos)
	{
		int pid;
		return scanNumber(text, ch, pos, pid);
	}

	/// Scans a numeric literal. ch is the character already consumed (the first
	/// digit, or '.' for literals starting with a dot) and pos points behind it.
	/// pid is set to TOK_IntegerLiteral or TOK_FloatLiteral; the return value is
	/// always TokenCat.Literal.
	static int scanNumber(S)(S text, dchar ch, ref size_t pos, ref int pid)
	{
		// pos after first digit
		int base = 10;
		size_t nextpos = pos;
		if(ch == '.')
			goto L_float;

		if(ch == '0')
		{
			size_t prevpos = pos;
			ch = trydecode(text, pos);
			ch = toLower(ch);
			if(ch == 'b')
				base = 2;
			else if (ch == 'x')
				base = 16;
			else
			{
				base = 8;
				pos = prevpos;
			}
		}

		// pos now after prefix or first digit
		skipDigits(text, pos, base);
		// pos now after last digit of integer part

		nextpos = pos;
		ch = trydecode(text, nextpos);
		if((base == 10 && toLower(ch) == 'e') || (base == 16 && toLower(ch) == 'p'))
			goto L_exponent;
		if(base >= 8 && ch == '.') // ".." is the slice token
		{
			{ // mute errors about goto skipping declaration
				size_t trypos = nextpos;
				dchar trych = trydecode(text, trypos);
				if (trych == '.')
					goto L_integer;
				//if (isAlpha(trych) || trych == '_' || (p[1] & 0x80))
				//	goto done;
			}
			// float
			if(base < 10)
				base = 10;
		L_float:
			pos = nextpos;
			skipDigits(text, pos, base);

			nextpos = pos;
			ch = trydecode(text, nextpos);
			if((base == 10 && toLower(ch) == 'e') || (base == 16 && toLower(ch) == 'p'))
			{
			L_exponent:
				// exponent
				pos = nextpos;
				ch = trydecode(text, nextpos);
				if(ch == '-' || ch == '+')
					pos = nextpos;
				skipDigits(text, pos, 10);
			}

			// suffix
			nextpos = pos;
			ch = trydecode(text, nextpos);
			if(ch == 'L' || toUpper(ch) == 'F')
			{
			L_floatLiteral:
				pos = nextpos;
				ch = trydecode(text, nextpos);
			}
			if(ch == 'i')
			L_complexLiteral:
				pos = nextpos;
			pid = TOK_FloatLiteral;
		}
		else
		{
			// check integer suffix
			if(ch == 'i')
				goto L_complexLiteral;
			if(toUpper(ch) == 'F')
				goto L_floatLiteral;

			if(toUpper(ch) == 'U')
			{
				pos = nextpos;
				ch = trydecode(text, nextpos);
				if(ch == 'L')
					pos = nextpos;
			}
			else if (ch == 'L')
			{
				pos = nextpos;
				ch = trydecode(text, nextpos);
				if(ch == 'i')
					goto L_complexLiteral;
				if(toUpper(ch) == 'U')
					pos = nextpos;
			}
		L_integer:
			pid = TOK_IntegerLiteral;
		}
		return TokenCat.Literal;
	}

	version(unspecified) unittest
	{
		int pid;
		size_t pos = 1;
		auto cat = scanNumber("0.0i", '0', pos, pid);
		assert(pid == TOK_FloatLiteral);
		pos = 1;
		cat = scanNumber("0.i", '0', pos, pid);
		assert(pid == TOK_IntegerLiteral);
	}

	/// Advances pos behind the next "*/". Returns State.kWhite if it was found,
	/// State.kBlockComment if the text ended first.
	static State scanBlockComment(S)(S text, ref size_t pos)
	{
		while(pos < text.length)
		{
			dchar ch = decode(text, pos);
			while(ch == '*')
			{
				if (pos >= text.length)
					return State.kBlockComment;
				ch = decode(text, pos);
				if(ch == '/')
					return State.kWhite;
			}
		}
		return State.kBlockComment;
	}

	/// Scans inside a /+ +/ comment, adjusting nesting for every "/+" and "+/".
	/// Returns State.kWhite once nesting drops to 0. With mSplitNestedComments
	/// set, scanning also stops in front of an inner "/+" (unless it sits
	/// directly at startpos + 2 or before) and behind an inner "+/", returning
	/// State.kNestedComment.
	State scanNestedComment(S)(S text, size_t startpos, ref size_t pos, ref int nesting)
	{
		while(pos < text.length)
		{
			dchar ch = decode(text, pos);
			while(ch == '/')
			{
				if (pos >= text.length)
					return State.kNestedComment;
				ch = decode(text, pos);
				if(ch == '+')
				{
					if(mSplitNestedComments && pos > startpos + 2)
					{
						// step back in front of "/+" so it starts the next token
						pos -= 2;
						return State.kNestedComment;
					}
					nesting++;
					goto nextChar;
				}
			}
			while(ch == '+')
			{
				if (pos >= text.length)
					return State.kNestedComment;
				ch = decode(text, pos);
				if(ch == '/')
				{
					nesting--;
					if(nesting == 0)
						return State.kWhite;
					if(mSplitNestedComments)
						return State.kNestedComment;
					break;
				}
			}
		nextChar:;
		}
		return State.kNestedComment;
	}

	/// Consumes an optional string postfix character 'c', 'w' or 'd'.
	/// Always returns State.kWhite.
	static State scanStringPostFix(S)(S text, ref size_t pos)
	{
		size_t nextpos = pos;
		dchar ch = trydecode(text, nextpos);
		if(ch == 'c' || ch == 'w' || ch == 'd')
			pos = nextpos;
		return State.kWhite;
	}

	/// Advances pos behind the closing '"' (and postfix) of a wysiwyg string;
	/// returns State.kStringWysiwyg if the text ends before it.
	static State scanStringWysiwyg(S)(S text, ref size_t pos)
	{
		while(pos < text.length)
		{
			dchar ch = decode(text, pos);
			if(ch == '"')
				return scanStringPostFix(text, pos);
		}
		return State.kStringWysiwyg;
	}

	/// Advances pos behind the closing '`' (and postfix) of a backquoted string;
	/// returns State.kStringAltWysiwyg if the text ends before it.
	static State scanStringAltWysiwyg(S)(S text, ref size_t pos)
	{
		while(pos < text.length)
		{
			dchar ch = decode(text, pos);
			if(ch == '`')
				return scanStringPostFix(text, pos);
		}
		return State.kStringAltWysiwyg;
	}

	/// Advances pos behind the terminator term (and postfix), skipping the
	/// character following each backslash. Returns State.kStringCStyle if the
	/// text ends before the terminator.
	static State scanStringCStyle(S)(S text, ref size_t pos, dchar term)
	{
		while(pos < text.length)
		{
			dchar ch = decode(text, pos);
			if(ch == '\\')
			{
				if (pos >= text.length)
					break;
				ch = decode(text, pos);
			}
			else if(ch == term)
				return scanStringPostFix(text, pos);
		}
		return State.kStringCStyle;
	}

	/// Called with pos behind q" to determine the kind of delimited string from
	/// the delimiter that follows. For bracket delimiters nesting is set to 1;
	/// for other delimiters it receives the delimiter id from getDelimiterIndex.
	State startDelimiterString(S)(S text, ref size_t pos, ref int nesting)
	{
		import std.uni : isWhite;

		nesting = 1;

		auto startpos = pos;
		dchar ch = trydecode(text, pos);
		State s = State.kStringDelimited;
		if(ch == '[')
			s = State.kStringDelimitedNestedBracket;
		else if(ch == '(')
			s = State.kStringDelimitedNestedParen;
		else if(ch == '{')
			s = State.kStringDelimitedNestedBrace;
		else if(ch == '<')
			s = State.kStringDelimitedNestedAngle;
		else if(ch == 0 || isWhite(ch)) // bad delimiter, fallback to wysiwyg string
			s = State.kStringWysiwyg;
		else
		{
			if(isIdentifierChar(ch))
				scanIdentifier(text, startpos, pos);
			string delim = toUTF8(text[startpos .. pos]);
			nesting = getDelimiterIndex(delim);
		}
		return s;
	}

	/// Scans tokens of a token string until the braces are balanced (tokLevel
	/// reaches 0) or the text ends. Returns State.kStringToken if still inside
	/// the token string, State.kWhite otherwise.
	State scanTokenString(S)(S text, ref size_t pos, ref int tokLevel)
	{
		int state = toState(State.kWhite, 0, 0, 0);
		int id = -1;
		while(pos < text.length && tokLevel > 0)
		{
			scan(state, text, pos, id);
			if(id == TOK_lcurly)
				tokLevel++;
			else if(id == TOK_rcurly)
				tokLevel--;
		}
		return (tokLevel > 0 ? State.kStringToken : State.kWhite);
	}

	/// Checks whether idx is on the first or second character of "/*" or "/+".
	/// If it is on the second character, idx is moved back onto the '/'.
	/// Returns false for an idx outside of txt.
	static bool isStartingComment(S)(S txt, ref size_t idx)
	{
		if(idx >= txt.length)
			return false;

		// idx < txt.length here, so the subtraction cannot wrap
		if(txt.length - idx > 1 && txt[idx] == '/' && (txt[idx+1] == '*' || txt[idx+1] == '+'))
			return true;
		if(idx > 0 && (txt[idx] == '*' || txt[idx] == '+') && txt[idx-1] == '/')
		{
			idx--;
			return true;
		}
		return false;
	}

	/// Checks whether pos is on the first or second character of "*/" or "+/".
	/// If it is on the '/', pos is moved back onto the '*' or '+'.
	/// Returns false for a pos outside of txt.
	static bool isEndingComment(S)(S txt, ref size_t pos)
	{
		if(pos >= txt.length)
			return false;

		if(pos > 0 && txt[pos] == '/' && (txt[pos-1] == '*' || txt[pos-1] == '+'))
		{
			pos--;
			return true;
		}
		// pos < txt.length here, so the subtraction cannot wrap
		if(txt.length - pos > 1 && (txt[pos] == '*' || txt[pos] == '+') && txt[pos+1] == '/')
			return true;
		return false;
	}

	/// Characters accepted at the start of an identifier: alphabetic characters,
	/// '_', '@' and, if mAllowDollarInIdentifiers is set, '$'.
	bool isIdentifierChar(dchar ch)
	{
		if(mAllowDollarInIdentifiers && ch == '$')
			return true;
		return isAlpha(ch) || ch == '_' || ch == '@';
	}

	/// Characters accepted inside an identifier.
	bool isIdentifierCharOrDigit(dchar ch)
	{
		return isIdentifierChar(ch) || isDigit(ch);
	}

	/// True if text is non-empty, starts with an identifier character and
	/// continues with identifier characters or digits only.
	bool isIdentifier(S)(S text)
	{
		if(text.length == 0)
			return false;

		size_t pos;
		dchar ch = decode(text, pos);
		if(!isIdentifierChar(ch))
			return false;

		while(pos < text.length)
		{
			ch = decode(text, pos);
			if(!isIdentifierCharOrDigit(ch))
				return false;
		}
		return true;
	}

	/// True if text is non-empty and consists of digits only.
	static bool isInteger(S)(S text)
	{
		if(text.length == 0)
			return false;

		size_t pos;
		while(pos < text.length)
		{
			dchar ch = decode(text, pos);
			if(!isDigit(ch))
				return false;
		}
		return true;
	}

	/// True if ch1 and ch2 are the two halves of {}, () or [], in either order.
	static bool isBracketPair(dchar ch1, dchar ch2)
	{
		switch(ch1)
		{
			case '{': return ch2 == '}';
			case '}': return ch2 == '{';
			case '(': return ch2 == ')';
			case ')': return ch2 == '(';
			case '[': return ch2 == ']';
			case ']': return ch2 == '[';
			default:  return false;
		}
	}

	static bool isOpeningBracket(dchar ch)
	{
		return ch == '[' || ch == '(' || ch == '{';
	}

	static bool isClosingBracket(dchar ch)
	{
		return ch == ']' || ch == ')' || ch == '}';
	}

	/// Opening delimiter of a kStringDelimitedNested* state; asserts for any
	/// other state.
	static dchar openingBracket(State s)
	{
		switch(s)
		{
			case State.kStringDelimitedNestedBracket: return '[';
			case State.kStringDelimitedNestedParen:   return '(';
			case State.kStringDelimitedNestedBrace:   return '{';
			case State.kStringDelimitedNestedAngle:   return '<';
			default: break;
		}
		assert(0);
	}

	/// Closing delimiter of a kStringDelimitedNested* state; asserts for any
	/// other state.
	static dchar closingBracket(State s)
	{
		switch(s)
		{
			case State.kStringDelimitedNestedBracket: return ']';
			case State.kStringDelimitedNestedParen:   return ')';
			case State.kStringDelimitedNestedBrace:   return '}';
			case State.kStringDelimitedNestedAngle:   return '>';
			default: break;
		}
		assert(0);
	}

	/// True for comment tokens and for text tokens starting with white space.
	/// An empty text is never considered white space.
	static bool isCommentOrSpace(S)(int type, S text)
	{
		return (type == TokenCat.Comment || (type == TokenCat.Text && text.length > 0 && isWhite(text[0])));
	}

	/// Scans inside a bracket-delimited string, tracking the bracket nesting.
	/// The string ends at a '"' seen while nesting is 0; until then the state s
	/// is returned unchanged.
	static State scanNestedDelimiterString(S)(S text, ref size_t pos, State s, ref int nesting)
	{
		dchar open  = openingBracket(s);
		dchar close = closingBracket(s);

		while(pos < text.length)
		{
			dchar ch = decode(text, pos);
			if(ch == open)
				nesting++;
			else if(ch == close && nesting > 0)
				nesting--;
			else if(ch == '"' && nesting == 0)
				return scanStringPostFix(text, pos);
		}
		return s;
	}

	/// Scans inside a string with the delimiter stored in s_delimiters[delim].
	/// The string ends when the delimiter is directly followed by '"'; delim is
	/// reset to 0 in that case. Returns State.kStringDelimited otherwise.
	State scanDelimitedString(S)(S text, ref size_t pos, ref int delim)
	{
		string delimiter = s_delimiters[delim];

		while(pos < text.length)
		{
			auto startpos = pos;
			dchar ch = decode(text, pos);
			if(isIdentifierChar(ch))
				scanIdentifier(text, startpos, pos);
			string ident = toUTF8(text[startpos .. pos]);
			if(ident == delimiter)
			{
				ch = trydecode(text, pos);
				if(ch == '"')
				{
					delim = 0; // reset delimiter id, it shadows nesting
					return scanStringPostFix(text, pos);
				}
			}
		}
		return State.kStringDelimited;
	}

	/// Scans the next token of text starting at pos.
	/// Params:
	///   state = packed scan state (see toState); updated for the next call
	///   text  = text to scan; in State.kWhite, pos must be inside text because
	///           the first character is decoded unconditionally
	///   pos   = in: start of the token, out: position behind the token
	///   id    = receives the TOK_* id; always TOK_StringLiteral while inside a
	///           token string
	/// Returns: the TokenCat of the token
	int scan(S)(ref int state, in S text, ref size_t pos, ref int id)
	{
		State s = scanState(state);
		int nesting = nestingLevel(state);
		int tokLevel = tokenStringLevel(state);
		int otherState = getOtherState(state);

		int type = TokenCat.Text;
		size_t startpos = pos;
		dchar ch;

		id = TOK_Space;

		switch(s)
		{
		case State.kWhite:
			ch = decode(text, pos);
			if(ch == 'r' || ch == 'x' || ch == 'q')
			{
				size_t prevpos = pos;
				dchar nch = trydecode(text, pos);
				if(nch == '"' && ch == 'q')
				{
					s = startDelimiterString(text, pos, nesting);
					if(s == State.kStringDelimited)
						goto case State.kStringDelimited;
					else if(s == State.kStringWysiwyg)
						goto case State.kStringWysiwyg;
					else
						goto case State.kStringDelimitedNestedBracket;
				}
				else if(tokLevel == 0 && ch == 'q' && nch == '{')
				{
					type = TokenCat.String;
					id = TOK_StringLiteral;
					if(mTokenizeTokenString)
					{
						pos = prevpos;
						s = State.kStringTokenFirst;
					}
					else
					{
						tokLevel = 1;
						s = scanTokenString(text, pos, tokLevel);
					}
					break;
				}
				else if(nch == '"')
				{
					goto case State.kStringWysiwyg;
				}
				else
				{
					pos = prevpos;
					type = scanIdentifier(text, startpos, pos, id);
				}
			}
			else if(isIdentifierChar(ch))
				type = scanIdentifier(text, startpos, pos, id);
			else if(isDigit(ch))
				type = scanNumber(text, ch, pos, id);
			else if (ch == '.')
			{
				size_t nextpos = pos;
				ch = trydecode(text, nextpos);
				if(isDigit(ch))
					type = scanNumber(text, '.', pos, id);
				else
					type = scanOperator(text, startpos, pos, id);
			}
			else if (ch == '/')
			{
				size_t prevpos = pos;
				ch = trydecode(text, pos);
				if (ch == '/')
				{
					// line comment
					type = TokenCat.Comment;
					id = TOK_Comment;
					while(pos < text.length && decode(text, pos) != '\n') {}
				}
				else if (ch == '*')
				{
					s = scanBlockComment(text, pos);
					type = TokenCat.Comment;
					id = TOK_Comment;
				}
				else if (ch == '+')
				{
					nesting = 1;
					s = scanNestedComment(text, startpos, pos, nesting);
					type = TokenCat.Comment;
					id = TOK_Comment;
				}
				else
				{
					// step back to position after '/'
					pos = prevpos;
					type = scanOperator(text, startpos, pos, id);
				}
			}
			else if (ch == '"')
				goto case State.kStringCStyle;
			else if (ch == '`')
				goto case State.kStringAltWysiwyg;
			else if (ch == '\'')
			{
				s = scanStringCStyle(text, pos, '\'');
				id = TOK_CharacterLiteral;
				type = TokenCat.String;
			}
			else if (ch == '#')
			{
				// display #! or #line as line comment
				type = TokenCat.Comment;
				id = TOK_Comment;
				while(pos < text.length && decode(text, pos) != '\n') {}
			}
			else
			{
				if (tokLevel > 0)
				{
					if(ch == '{')
						tokLevel++;
					else if (ch == '}')
						tokLevel--;
					if(!isWhite(ch))
						type = scanOperator(text, startpos, pos, id);
					id = TOK_StringLiteral;
				}
				else if(!isWhite(ch))
					type = scanOperator(text, startpos, pos, id);
			}
			break;

		case State.kStringTokenFirst:
			ch = decode(text, pos);
			assert(ch == '{');

			tokLevel = 1;
			type = TokenCat.Operator;
			id = TOK_StringLiteral;
			s = State.kWhite;
			break;

		case State.kStringToken:
			type = TokenCat.String;
			id = TOK_StringLiteral;
			s = scanTokenString(text, pos, tokLevel);
			break;

		case State.kBlockComment:
			s = scanBlockComment(text, pos);
			type = TokenCat.Comment;
			id = TOK_Comment;
			break;

		case State.kNestedComment:
			s = scanNestedComment(text, pos, pos, nesting);
			type = TokenCat.Comment;
			id = TOK_Comment;
			break;

		case State.kStringCStyle:
			s = scanStringCStyle(text, pos, '"');
			type = TokenCat.String;
			id = TOK_StringLiteral;
			break;

		case State.kStringWysiwyg:
			s = scanStringWysiwyg(text, pos);
			type = TokenCat.String;
			id = TOK_StringLiteral;
			break;

		case State.kStringAltWysiwyg:
			s = scanStringAltWysiwyg(text, pos);
			type = TokenCat.String;
			id = TOK_StringLiteral;
			break;

		case State.kStringDelimited:
			s = scanDelimitedString(text, pos, nesting);
			type = TokenCat.String;
			id = TOK_StringLiteral;
			break;

		case State.kStringDelimitedNestedBracket:
		case State.kStringDelimitedNestedParen:
		case State.kStringDelimitedNestedBrace:
		case State.kStringDelimitedNestedAngle:
			s = scanNestedDelimiterString(text, pos, s, nesting);
			type = TokenCat.String;
			id = TOK_StringLiteral;
			break;

		default:
			break;
		}
		state = toState(s, nesting, tokLevel, otherState);

		if(tokLevel > 0)
			id = TOK_StringLiteral;
		return type;
	}

	/// Convenience overload of scan that discards the token id.
	int scan(S)(ref int state, in S text, ref size_t pos)
	{
		int id;
		return scan(state, text, pos, id);
	}

	///////////////////////////////////////////////////////////////
	/// Scans all of text starting in state iState and returns one TokenInfo per
	/// token. The final scan state is not reported back to the caller.
	TokenInfo[] ScanLine(S)(int iState, S text)
	{
		TokenInfo[] lineInfo;
		for(size_t pos = 0; pos < text.length; )
		{
			TokenInfo info;
			// explicit casts: size_t does not convert implicitly to int
			// where size_t is 64 bits wide
			info.StartIndex = cast(int) pos;
			info.type = cast(TokenCat) scan(iState, text, pos, info.tokid);
			info.EndIndex = cast(int) pos;
			lineInfo ~= info;
		}
		return lineInfo;
	}
}

///////////////////////////////////////////////////////////////
// converted int[string] to short[string] due to bug #2500
__gshared short[string] keywords_map; // maps to TOK enumerator
__gshared short[string] specials_map; // maps to TOK enumerator
alias AssociativeArray!(string, short) _wa1; // fully instantiate type info
alias AssociativeArray!(int, const(int)) _wa2; // fully instantiate type info

shared static this()
{
	foreach(i, s; keywords)
		keywords_map[s] = cast(short) (TOK_begin_Keywords + i);

	foreach(i, s; specials)
		specials_map[s] = cast(short) i;
}

/// Looks up ident in the keyword list; on success id receives its TOK_* value.
bool findKeyword(string ident, ref int id)
{
	if(__ctfe)
	{
		// slow, but compiles
		foreach(i, k; keywords)
			if(k == ident)
			{
				id = cast(int) (TOK_begin_Keywords + i);
				return true;
			}
	}
	else if(auto pident = ident in keywords_map)
	{
		id = *pident;
		return true;
	}
	return false;
}

/// True if ident is listed in keywords.
bool isKeyword(string ident)
{
	int id;
	return findKeyword(ident, id);
}

/// Looks up ident in the list of special tokens; on success id is set to
/// TOK_StringLiteral.
bool findSpecial(string ident, ref int id)
{
	if(__ctfe)
	{
		// slow, but compiles
		foreach(k; specials)
			if(k == ident)
			{
				id = TOK_StringLiteral;
				return true;
			}
	}
	else if(ident in specials_map)
	{
		id = TOK_StringLiteral;
		return true;
	}
	return false;
}

// the order of this list defines the values of the generated TOK_* keyword
// enumerators (see genKeywordsEnum)
const string[] keywords =
[
	"this",
	"super",
	"assert",
	"null",
	"true",
	"false",
	"cast",
	"new",
	"delete",
	"throw",
	"module",
	"pragma",
	"typeof",
	"typeid",
	"template",

	"void",
	"byte",
	"ubyte",
	"short",
	"ushort",
	"int",
	"uint",
	"long",
	"ulong",
	"cent",
	"ucent",
	"float",
	"double",
	"real",
	"bool",
	"char",
	"wchar",
	"dchar",
	"ifloat",
	"idouble",
	"ireal",

	"cfloat",
	"cdouble",
	"creal",

	"delegate",
	"function",

	"is",
	"if",
	"else",
	"while",
	"for",
	"do",
	"switch",
	"case",
	"default",
	"break",
	"continue",
	"synchronized",
	"return",
	"goto",
	"try",
	"catch",
	"finally",
	"with",
	"asm",
	"foreach",
	"foreach_reverse",
	"scope",

	"struct",
	"class",
	"interface",
	"union",
	"enum",
	"import",
	"mixin",
	"static",
	"final",
	"const",
	"typedef",
	"alias",
	"override",
	"abstract",
	"volatile",
	"debug",
	"deprecated",
	"in",
	"out",
	"inout",
	"lazy",
	"auto",

	"align",
	"extern",
	"private",
	"package",
	"protected",
	"public",
	"export",

	"body",
	"invariant",
	"unittest",
	"version",
	//{	"manifest",	TOKmanifest	},

	// Added after 1.0
	"ref",
	"macro",
	"pure",
	"nothrow",
	"__gshared",
	"__thread",
	"__traits",
	"__overloadset",
	"__parameters",
	"__argTypes",
	"__vector",

	"__FILE__",
	"__LINE__",
	"__FUNCTION__",
	"__PRETTY_FUNCTION__",
	"__MODULE__",

	"shared",
	"immutable",

	"@disable",
	"@property",
	"@nogc",
	"@safe",
	"@system",
	"@trusted",

];

// not listed as keywords, but "special tokens"
const string[] specials =
[
	"__DATE__",
	"__EOF__",
	"__TIME__",
	"__TIMESTAMP__",
	"__VENDOR__",
	"__VERSION__",
];

////////////////////////////////////////////////////////////////////////
enum
{
	TOK_begin_Generic,
	TOK_Space = TOK_begin_Generic,
	TOK_Comment,
	TOK_Identifier,
	TOK_IntegerLiteral,
	TOK_FloatLiteral,
	TOK_StringLiteral,
	TOK_CharacterLiteral,
	TOK_EOF,
	TOK_RECOVER,
	TOK_end_Generic
}

/// Name of the enumerator for keyword kw: "TOK_" followed by the keyword
/// without a leading '@'.
string genKeywordEnum(string kw)
{
	if(kw[0] == '@')
		kw = kw[1..$];
	return "TOK_" ~ kw;
}

/// Generates the source of an anonymous enum with one enumerator per entry of
/// kwords, bracketed by TOK_begin_Keywords (= begin) and TOK_end_Keywords.
string genKeywordsEnum(T)(const string[] kwords, T begin)
{
	string enums = "enum { TOK_begin_Keywords = " ~ to!string(begin) ~ ", ";
	bool first = true;
	foreach(kw; kwords)
	{
		enums ~= genKeywordEnum(kw);
		if(first)
		{
			first = false;
			enums ~= " = TOK_begin_Keywords";
		}
		enums ~= ",";
	}
	enums ~= "TOK_end_Keywords }";
	return enums;
}

mixin(genKeywordsEnum(keywords, "TOK_end_Generic"));

// pairs of [ enumerator name without "TOK_" prefix, operator text ]; the order
// defines the values of the generated enumerators (see genOperatorEnum)
const string[2][] operators =
[
	[ "lcurly",           "{" ],
	[ "rcurly",           "}" ],
	[ "lparen",           "(" ],
	[ "rparen",           ")" ],
	[ "lbracket",         "[" ],
	[ "rbracket",         "]" ],
	[ "semicolon",        ";" ],
	[ "colon",            ":" ],
	[ "comma",            "," ],
	[ "dot",              "." ],

	// binary operators
	[ "xor",              "^" ],

	[ "lt",               "<" ],
	[ "gt",               ">" ],
	[ "le",               "<=" ],
	[ "ge",               ">=" ],
	[ "equal",            "==" ],
	[ "notequal",         "!=" ],
	[ "lambda",           "=>" ],

	[ "unord",            "!<>=" ],
	[ "ue",               "!<>" ],
	[ "lg",               "<>" ],
	[ "leg",              "<>=" ],
	[ "ule",              "!>" ],
	[ "ul",               "!>=" ],
	[ "uge",              "!<" ],
	[ "ug",               "!<=" ],
	[ "notcontains",      "!in" ],
	[ "notidentity",      "!is" ],

	[ "shl",              "<<" ],
	[ "shr",              ">>" ],
	[ "ushr",             ">>>" ],
	[ "add",              "+" ],
	[ "min",              "-" ],
	[ "mul",              "*" ],
	[ "div",              "/" ],
	[ "mod",              "%" ],
	[ "pow",              "^^" ],
	[ "and",              "&" ],
	[ "andand",           "&&" ],
	[ "or",               "|" ],
	[ "oror",             "||" ],
	[ "tilde",            "~" ],

	[ "assign",           "=" ],
	[ "xorass",           "^=" ],
	[ "addass",           "+=" ],
	[ "minass",           "-=" ],
	[ "mulass",           "*=" ],
	[ "divass",           "/=" ],
	[ "modass",           "%=" ],
	[ "powass",           "^^=" ],
	[ "shlass",           "<<=" ],
	[ "shrass",           ">>=" ],
	[ "ushrass",          ">>>=" ],
	[ "andass",           "&=" ],
	[ "orass",            "|=" ],
	[ "catass",           "~=" ],
	// end of binary operators

	[ "not",              "!" ],
	[ "dollar",           "$" ],
	[ "slice",            ".." ],
	[ "dotdotdot",        "..." ],
	[ "plusplus",         "++" ],
	[ "minusminus",       "--" ],
	[ "question",         "?" ],
	/+
	[ "array",            "[]" ],
	// symbols with duplicate meaning
	[ "address",          "&" ],
	[ "star",             "*" ],
	[ "preplusplus",      "++" ],
	[ "preminusminus",    "--" ],
	[ "neg",              "-" ],
	[ "uadd",             "+" ],
	[ "cat",              "~" ],
	[ "identity",         "is" ],
	[ "plus",             "++" ],
	[ "minus",            "--" ],
	+/
];

/// Generates the source of an anonymous enum with one enumerator "TOK_" ~ name
/// per entry of ops, bracketed by TOK_begin_Operators (= begin) and
/// TOK_end_Operators.
string genOperatorEnum(T)(const string[2][] ops, T begin)
{
	string enums = "enum { TOK_begin_Operators = " ~ to!string(begin) ~ ", ";
	bool first = true;
	for(int o = 0; o < ops.length; o++)
	{
		enums ~= "TOK_" ~ ops[o][0];
		if(first)
		{
			first = false;
			enums ~= " = TOK_begin_Operators";
		}
		enums ~= ",";
	}
	enums ~= "TOK_end_Operators }";
	return enums;
}

mixin(genOperatorEnum(operators, "TOK_end_Keywords"));

enum TOK_binaryOperatorFirst = TOK_xor;
enum TOK_binaryOperatorLast  = TOK_catass;
enum TOK_assignOperatorFirst = TOK_assign;
enum TOK_assignOperatorLast  = TOK_catass;
enum TOK_unorderedOperatorFirst = TOK_unord;
enum TOK_unorderedOperatorLast  = TOK_ug;

enum TOK_error = -1;

/// True if both strings have at least length characters and agree in their
/// first length characters.
bool _stringEqual(string s1, string s2, int length)
{
	if(s1.length < length || s2.length < length)
		return false;
	for(int i = 0; i < length; i++)
		if(s1[i] != s2[i])
			return false;
	return true;
}

/// Returns the indices into operators ordered by ascending operator text
/// (insertion sort); asserts if an operator text occurs twice.
int[] sortedOperatorIndexArray()
{
	// create sorted list of operators
	int[] opIndex;
	for(int o = 0; o < operators.length; o++)
	{
		string op = operators[o][1];
		int p = 0;
		while(p < opIndex.length)
		{
			assert(op != operators[opIndex[p]][1], "duplicate operator " ~ op);
			if(op < operators[opIndex[p]][1])
				break;
			p++;
		}
		// array slicing does not work in CTFE?
		// opIndex ~= opIndex[0..p] ~ o ~ opIndex[p..$];
		int[] nIndex;
		for(int i = 0; i < p; i++)
			nIndex ~= opIndex[i];
		nIndex ~= o;
		for(int i = p; i < opIndex.length; i++)
			nIndex ~= opIndex[i];
		opIndex = nIndex;
	}
	return opIndex;
}

/// Returns the operator texts in the order of sortedOperatorIndexArray.
string[] sortedOperatorArray()
{
	string[] array;
	foreach(o; sortedOperatorIndexArray())
		array ~= operators[o][1];
	return array;
}

/// Generates the source of nested switch statements that match the longest
/// operator from the characters delivered by the expression getch. The
/// generated code assigns the matched length to a variable named len and
/// returns the TOK_* id, or TOK_error if nothing matches (see parseOperator
/// for the context it is mixed into).
string genOperatorParser(string getch)
{
	int[] opIndex = sortedOperatorIndexArray();

	int matchlen = 0;
	string indent = "";
	string[] defaults = [ "error" ];
	string txt = indent ~ "dchar ch;\n";
	for(int o = 0; o < opIndex.length; o++)
	{
		string op = operators[opIndex[o]][1];
		string nextop;
		if(o + 1 < opIndex.length)
			nextop = operators[opIndex[o+1]][1];

		// every opened switch level appends two characters to indent twice;
		// closing a level below strips indent[0..$-2] twice, so the two
		// amounts must match
		while(op.length > matchlen)
		{
			if(matchlen > 0)
				txt ~= indent ~ "case '" ~ op[matchlen-1] ~ "':\n";
			indent ~= "  ";
			txt ~= indent ~ "ch = " ~ getch ~ ";\n";
			txt ~= indent ~ "switch(ch)\n";
			txt ~= indent ~ "{\n";
			indent ~= "  ";
			int len = (matchlen > 0 ? matchlen - 1 : 0);
			while(len > 0 && defaults[len] == defaults[len+1])
				len--;
			txt ~= indent ~ "default: len = " ~ to!string(len) ~ "; return TOK_" ~ defaults[$-1] ~ ";\n";
			//txt ~= indent ~ "case '" ~ op[matchlen] ~ "':\n";
			defaults ~= defaults[$-1];
			matchlen++;
		}
		if(nextop.length > matchlen && nextop[0..matchlen] == op)
		{
			// op is a prefix of the next operator: open another switch level
			// that falls back to op in its default case
			if(matchlen > 0)
				txt ~= indent ~ "case '" ~ op[matchlen-1] ~ "':\n";
			indent ~= "  ";
			txt ~= indent ~ "ch = " ~ getch ~ ";\n";
			txt ~= indent ~ "switch(ch)\n";
			txt ~= indent ~ "{\n";
			indent ~= "  ";
			txt ~= indent ~ "default: len = " ~ to!string(matchlen) ~ "; return TOK_" ~ operators[opIndex[o]][0] ~ "; // " ~ op ~ "\n";
			defaults ~= operators[opIndex[o]][0];
			matchlen++;
		}
		else
		{
			string case_txt = "case '" ~ op[matchlen-1] ~ "':";
			// operators ending in a letter or digit (e.g. "!is") must not be
			// followed by another identifier character
			if(isAlphaNum(op[matchlen-1]))
				case_txt ~= " ch = " ~ getch ~ "; if(isAlphaNum(ch) || ch == '_') goto default;\n" ~ indent ~ " ";
			txt ~= indent ~ case_txt ~ " len = " ~ to!string(matchlen) ~ "; return TOK_" ~ operators[opIndex[o]][0] ~ "; // " ~ op ~ "\n";

			while(nextop.length < matchlen || (matchlen > 0 && !_stringEqual(op, nextop, matchlen-1)))
			{
				matchlen--;
				indent = indent[0..$-2];
				txt ~= indent ~ "}\n";
				indent = indent[0..$-2];
				defaults = defaults[0..$-1];
			}
		}
	}
	return txt;
}

/// Matches the longest operator in txt starting at pos. Returns its TOK_* id
/// and stores its length in len, or returns TOK_error if no operator matches.
/// Reading behind the end of txt yields the character 0.
int parseOperator(S)(S txt, size_t pos, ref size_t len)
{
	dchar getch()
	{
		if(pos >= txt.length)
			return 0;
		return decode(txt, pos);
	}

	mixin(genOperatorParser("getch()"));
}

////////////////////////////////////////////////////////////////////////
version(none)
{
	pragma(msg, genKeywordsEnum(keywords, "TOK_end_Generic"));
	pragma(msg, genOperatorEnum(operators, "TOK_end_Keywords"));
	pragma(msg, sortedOperatorArray());
	pragma(msg, genOperatorParser("getch()"));
}

/// Returns a display string for token id: the keyword or operator text, or a
/// fixed name for the generic tokens. Asserts for any other id.
string tokenString(int id)
{
	switch(id)
	{
		case TOK_Space:            return " ";
		case TOK_Comment:          return "/**/";
		case TOK_Identifier:       return "Identifier";
		case TOK_IntegerLiteral:   return "IntegerLiteral";
		case TOK_FloatLiteral:     return "FloatLiteral";
		case TOK_StringLiteral:    return "StringLiteral";
		case TOK_CharacterLiteral: return "CharacterLiteral";
		case TOK_EOF:              return "__EOF__";
		case TOK_RECOVER:          return "__RECOVER__";
		case TOK_begin_Keywords: .. case TOK_end_Keywords - 1:
			return keywords[id - TOK_begin_Keywords];
		case TOK_begin_Operators: .. case TOK_end_Operators - 1:
			return operators[id - TOK_begin_Operators][1];
		default:
			assert(false);
	}
}

/// Returns the enumerator name (without "TOK_" prefix) of operator token id.
/// Asserts if id is not an operator.
string operatorName(int id)
{
	switch(id)
	{
		case TOK_begin_Operators: .. case TOK_end_Operators - 1:
			return operators[id - TOK_begin_Operators][0];
		default:
			assert(false);
	}
}

// case labels for all basic type keywords, to be mixed into a switch statement
enum case_TOKs_BasicTypeX = q{
	case TOK_bool:
	case TOK_byte:
	case TOK_ubyte:
	case TOK_short:
	case TOK_ushort:
	case TOK_int:
	case TOK_uint:
	case TOK_long:
	case TOK_ulong:
	case TOK_char:
	case TOK_wchar:
	case TOK_dchar:
	case TOK_float:
	case TOK_double:
	case TOK_real:
	case TOK_ifloat:
	case TOK_idouble:
	case TOK_ireal:
	case TOK_cfloat:
	case TOK_cdouble:
	case TOK_creal:
	case TOK_void:
};

enum case_TOKs_TemplateSingleArgument = q{
	case TOK_Identifier:
	case TOK_CharacterLiteral:
	case TOK_StringLiteral:
	case TOK_IntegerLiteral:
	case TOK_FloatLiteral:
	case TOK_true:
	case TOK_false:
	case TOK_null:
	case TOK___FILE__:
	case TOK___LINE__:
}; // + case_TOKs_BasicTypeX;