// Crevice: src/language/Tokenizer.cc Source File

 /* 
  * File:   Tokenizer.cc
  * 
  * Copyright 2014 Heinrich Schuchardt <xypron.glpk@gmx.de>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  * 
  *   http://www.apache.org/licenses/LICENSE-2.0
  * 
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 #include <iostream>
 #include <sstream>
 #include <string>
 #include "Tokenizer.h"
 #include "Ustring.h"
 
 // ASCII punctuation characters that are tokenized as operators and that end
 // an identifier. The underscore is not in the set.
 const std::string Tokenizer::SPEC_CHARS
 ("!\"#$%&'()*+,-./:;<=>?@[\\]^`{|}~");
 
 Tokenizer::Tokenizer(Ustring &str) {
     Ustring *ustr = NULL;
     try {
         ustr = new Ustring();
         for (int i = 0;;) {
             const Uchar u = str.charAt(i);
             if (u <= ' ') {
                 // white space
                 ++i;
             } else if (u == '#') {
                 // comment
                 while (str.charAt(i) != '\n') {
                     ++i;
                 }
             } else if (u == '"') {
                 parseString(str, i);
             } else if (u >= '0' && u <= '9') {
                 parseNumber(str, i);
             } else if (u < 0x80 && SPEC_CHARS.find((char) u)
                     != std::string::npos) {
                 parseOperator(str, i);
                 continue;
             } else {
                 parseId(str, i);
             }
         }
     } catch (const Ustring::Error &e) {
     }
     delete ustr;
 }
 
 // Destructor. Performs no explicit clean-up.
 // NOTE(review): the parse functions allocate tokens with new and pass them
 // to tokenlist; nothing in this file deletes them. Confirm that TokenList
 // takes ownership of the appended tokens.
 Tokenizer::~Tokenizer() {
 }
 
 // Appends a token to the end of the token list.
 void Tokenizer::add(Token *tok) {
     tokenlist.append(tok);
 }
 
 void Tokenizer::parseBlockComment(Ustring &s, int &i) {
     for (;;) {
         Uchar u = s.charAt(++i);
         if (u == '*') {
             u = s.charAt(++i);
             if (u == '/') {
                 return;
             }
         }
     }
 }
 
 void Tokenizer::parseId(Ustring &s, int &i) {
     Ustring ustr;
     Uchar u;
     Token *t = NULL;
 
     for (;; ++i) {
         u = s.charAt(i);
         if (u <= ' ' || Tokenizer::SPEC_CHARS.find((char) u)
                 != std::string::npos) {
             break;
         }
         ustr += u;
     }
 
     t = new Token(Token::ID, ustr);
     add(t);
 }
 
 void Tokenizer::parseNumber(Ustring &s, int &i) {
     Error error;
     double value = 0;
     bool expneg = false;
     int exponent = 0;
     bool dp = false;
     int decimals = 0;
     Uchar u0;
     Token *t;
 
     for (;; ++i) {
         u0 = s.charAt(i);
         if (u0 == '.') {
             Uchar u1 = s.charAt(++i);
             --i;
             if (u1 == '.') {
                 --i;
                 parseOperator(s, i);
                 return;
             }
             if (dp) {
                 throw error;
             }
             dp = true;
             continue;
         } else if (u0 < '0' || u0 > '9') {
             break;
         }
         value *= 10.;
         value += u0 - '0';
         if (dp) {
             ++decimals;
         }
     }
     if (u0 == 'E' || u0 == 'e') {
         u0 = s.charAt(++i);
         if (u0 == '-') {
             expneg = true;
             u0 = s.charAt(++i);
         } else if (u0 == '+') {
             u0 = s.charAt(++i);
         }
         for (;; ++i) {
             u0 = s.charAt(i);
             if (u0 < '0' || u0 > '9') {
                 break;
             }
             exponent *= 10;
             if (expneg) {
                 exponent -= u0 - '0';
             } else {
                 exponent += u0 - '0';
             }
         }
     }
     exponent -= decimals;
     if (expneg == false) {
         for (int i = 0; i < exponent; ++i) {
             value *= 10.;
         }
     } else {
         for (int i = 0; i > exponent; --i) {
             value /= 10.;
         }
     }
 
     t = new Token(value);
     add(t);
 }
 
 void Tokenizer::parseOperator(Ustring &s, int &i) {
     Ustring ustr;
     Uchar u0 = s.charAt(i);
     Uchar u1 = s.charAt(++i);
     Token *t;
 
     ustr += u0;
     switch (u0) {
         case '.':
             if (u1 == '.') {
                 ustr += u1;
                 ++i;
             } else if (u1 >= '0' && u1 <= '9') {
                 --i;
                 parseNumber(s, i);
                 return;
             }
             break;
         case '/':
             if (u1 == '*') {
                 parseBlockComment(s, i);
                 return;
             }
             break;
         case ':':
         case '=':
             if (u1 == '=') {
                 ustr += u1;
                 ++i;
             }
             break;
         case '|':
             if (u1 == '|') {
                 ustr += u1;
                 ++i;
             }
             break;
         case '&':
             if (u1 == '&') {
                 ustr += u1;
                 ++i;
             }
             break;
     }
     t = new Token(Token::OPERATOR, ustr);
     add(t);
 }
 
 void Tokenizer::parseString(Ustring &s, int& i) {
     Error error;
     Ustring ustr;
     Uchar u;
     Token *t;
 
     for (;;) {
         u = s.charAt(++i);
         if (u < ' ') {
             throw error;
         } else if (u == '"') {
             ++i;
             break;
         } else if (u == '\\') {
             u = s.charAt(++i);
             if (u < ' ') {
                 throw error;
             }
             switch (u) {
                 case '"':
                     ustr += '"';
                     break;
                 case '\\':
                     ustr += '\\';
                     break;
                 case 'n':
                     ustr += '\n';
                     break;
                 case 'r':
                     ustr += '\r';
                     break;
                 case 't':
                     ustr += '\t';
                     break;
                 case 'u':
                 {
                     Uchar e = 0;
                     for (int j = 0;; j++) {
                         u = s.charAt(++i);
                         if (u >= '0' && u <= '9') {
                             u -= '0';
                         } else if (u >= 'A' && u <= 'F') {
                             u -= 'A' - 10;
                         } else if (u >= 'a' && u <= 'f') {
                             u -= 'a' - 10;
                         } else if (u == ';') {
                             break;
                         } else {
                             throw error;
                         }
                         e <<= 4;
                         e += u;
                         if (e > 0x0010FFFFu) {
                             throw error;
                         }
                     }
                     ustr += e;
                     break;
                 }
                 case 'x':
                 {
                     Uchar e = 0;
                     for (int j = 0; j < 2; j++) {
                         u = s.charAt(++i);
                         if (u >= '0' && u <= '9') {
                             u -= '0';
                         } else if (u >= 'A' && u <= 'F') {
                             u -= 'A' - 10;
                         } else if (u >= 'a' && u <= 'f') {
                             u -= 'a' - 10;
                         } else {
                             throw error;
                         }
                         e <<= 4;
                         e += u;
                     }
                     ustr += e;
                     break;
                 }
                 default:
                     throw error;
             }
         } else {
             ustr += u;
         }
     }
 
     t = new Token(Token::STRING, ustr);
     add(t);
 }
 
 std::string Tokenizer::toString() {
     std::stringstream s;
     for (TokenList::iterator it = tokenlist.begin();
             it != tokenlist.end(); ++it) {
         std::string str;
         **it >> str;
         str += " ";
         s << str;
     }
     return s.str();
 }