自己动手写C语言编译器(3)
词法分析器部分完成。
支持:
1.支持单词分割
2.支持数字类型
3.支持字符串
4.支持换行
5.支持注释
?
不支持:
1.不支持关键字
2.不支持变量。
3.不支持关键字。
4.不支持操作符。
?
偶没有被那些个编译原理课程所吓倒。。。。。真的勇士,只管前行!
?
?
?
#ifndef _ISTREAMTOKENIZER_H_
#define _ISTREAMTOKENIZER_H_

#include <limits.h>

#include <cctype>
#include <istream>
#include <string>
#include <vector>

/* Number of elements in a C array. Kept for code that includes this header. */
#define _COUNT_OF(a) (sizeof(a)/sizeof(a[0]))

/**
 * Splits a byte stream into tokens: words, numbers, quoted strings,
 * end-of-line markers and single "ordinary" characters.
 *
 * The design follows java.io.StreamTokenizer: every byte value 0..255 is
 * looked up in a table of character-class flags, and nextToken() decides
 * what to scan from the class of the first non-whitespace character.
 *
 * After a call to nextToken():
 *   - ttype holds the token type (one of the TT_* constants, the quote
 *     character for a quoted string, or the character itself for an
 *     ordinary character);
 *   - sval holds the text of a word or the body of a quoted string;
 *   - nval holds the value of a number.
 *
 * The tokenizer keeps a reference to the stream passed to the constructor;
 * that stream must outlive the tokenizer.
 */
class IstreamTokenizer {
private:
    /*
     * The next character to be considered by nextToken() is kept in peekc.
     * It may also be NEED_CHAR, meaning a new character should be read, or
     * SKIP_LF, meaning a new character should be read and, if it is '\n',
     * discarded and replaced by a second new character.
     */
    static const int SKIP_LF;
    static const int NEED_CHAR;

    /* Character-class flags stored in the ctype table. */
    static const unsigned char CT_WHITESPACE;
    static const unsigned char CT_DIGIT;
    static const unsigned char CT_ALPHA;
    static const unsigned char CT_QUOTE;
    static const unsigned char CT_COMMENT;

    /* Size of the class table as a signed value, so that comparisons with
     * int arguments are never silently converted to unsigned. */
    enum { CTYPE_SIZE = 256 };

public:
    /* Token types reported through ttype / nextToken(). */
    static const int TT_EOF;      /* end of input */
    static const int TT_EOL;      /* end of line (only if eolIsSignificant(true)) */
    static const int TT_NUMBER;   /* number, value in nval */
    static const int TT_WORD;     /* word, text in sval */
    static const int TT_NOTHING;  /* no token has been read yet */

private:
    std::istream& input;
    std::vector<char> buf;        /* scratch buffer for the token being scanned */
    int peekc;
    bool pushedBack;
    bool forceLower;
    int LINENO;
    bool eolIsSignificantP;
    bool slashSlashCommentsP;
    bool slashStarCommentsP;
    unsigned char ctype[CTYPE_SIZE];

public:
    std::string sval;
    double nval;
    int ttype;

private:
    /** Installs the default syntax table on top of a cleared table. */
    void init() {
        resetSyntax();  /* the table is used with |= below, so it must start zeroed */
        wordChars('a', 'z');
        wordChars('A', 'Z');
        wordChars(128 + 32, 255);
        whitespaceChars(0, ' ');
        commentChar('/');
        quoteChar('"');
        quoteChar('\'');
        parseNumbers();
    }

    /** Clamps [low, hi] to the valid table indices; an empty range stays empty. */
    static void clampRange(int& low, int& hi) {
        if (low < 0)
            low = 0;
        if (hi >= CTYPE_SIZE)
            hi = CTYPE_SIZE - 1;
    }

    /** Returns true when ch is a valid index into the class table. */
    static bool inTable(int ch) {
        return ch >= 0 && ch < CTYPE_SIZE;
    }

    /** Class flags of a non-negative character; values past the table count as word characters. */
    int classOf(int c) const {
        if (c < CTYPE_SIZE)
            return ctype[c];
        return CT_ALPHA;
    }

    /** Reads the next character, or a negative value at end of input. */
    int read() {
        return input.get();
    }

    /**
     * Skips to the end of the current line, starting with the already-read
     * character c. The terminating '\n', '\r' or end-of-input value is left
     * in peekc so the next scan sees it.
     */
    void skipRestOfLine(int c) {
        while (c != '\n' && c != '\r' && c >= 0)
            c = read();
        peekc = c;
    }

    /** Scans a number whose first character is c ('-', '.' or a digit). */
    int scanNumber(int c) {
        bool negative = false;
        if (c == '-') {
            c = read();
            if (c != '.' && (c < '0' || c > '9')) {
                /* A lone '-' is an ordinary character, not a number. */
                peekc = c;
                return ttype = '-';
            }
            negative = true;
        }
        double value = 0;
        int fractionDigits = 0;
        bool seenDot = false;
        for (;;) {
            if (c == '.' && !seenDot) {
                seenDot = true;
            } else if ('0' <= c && c <= '9') {
                value = value * 10 + (c - '0');
                if (seenDot)
                    ++fractionDigits;
            } else {
                break;
            }
            c = read();
        }
        peekc = c;
        if (fractionDigits != 0) {
            /* Do one division of a likely-to-be-more-accurate number. */
            double denom = 10;
            for (int k = fractionDigits - 1; k > 0; --k)
                denom *= 10;
            value = value / denom;
        }
        nval = negative ? -value : value;
        return ttype = TT_NUMBER;
    }

    /** Scans a word whose first character is c; the text is stored in sval. */
    int scanWord(int c) {
        buf.clear();
        int cls;
        do {
            buf.push_back(static_cast<char>(c));
            c = read();
            /* End of input terminates the word like whitespace does. */
            cls = (c < 0) ? static_cast<int>(CT_WHITESPACE) : classOf(c);
        } while ((cls & (CT_ALPHA | CT_DIGIT)) != 0);
        peekc = c;
        sval.assign(buf.begin(), buf.end());
        if (forceLower) {
            for (std::string::size_type k = 0; k < sval.size(); ++k)
                sval[k] = static_cast<char>(
                    std::tolower(static_cast<unsigned char>(sval[k])));
        }
        return ttype = TT_WORD;
    }

    /**
     * Scans a quoted string opened by the character quote. The body, with
     * backslash escapes decoded, is stored in sval; ttype is the quote
     * character. The string ends at the matching quote, at a line end, or
     * at end of input.
     */
    int scanQuoted(int quote) {
        ttype = quote;
        buf.clear();
        int d = read();
        while (d >= 0 && d != ttype && d != '\n' && d != '\r') {
            int ch;
            if (d == '\\') {
                ch = read();
                if (ch < 0) {
                    /* Backslash at end of input: nothing to append. */
                    d = ch;
                    break;
                }
                const int first = ch; /* To allow \377, but not \477 */
                if (ch >= '0' && ch <= '7') {
                    ch = ch - '0';
                    int c2 = read();
                    if ('0' <= c2 && c2 <= '7') {
                        ch = (ch << 3) + (c2 - '0');
                        c2 = read();
                        if ('0' <= c2 && c2 <= '7' && first <= '3') {
                            ch = (ch << 3) + (c2 - '0');
                            d = read();
                        } else {
                            d = c2;
                        }
                    } else {
                        d = c2;
                    }
                } else {
                    switch (ch) {
                    case 'a': ch = 0x7;  break;
                    case 'b': ch = '\b'; break;
                    case 'f': ch = 0xC;  break;
                    case 'n': ch = '\n'; break;
                    case 'r': ch = '\r'; break;
                    case 't': ch = '\t'; break;
                    case 'v': ch = 0xB;  break;
                    }
                    d = read();
                }
            } else {
                ch = d;
                d = read();
            }
            buf.push_back(static_cast<char>(ch));
        }
        /* If we left the loop because we found the matching quote, arrange
         * to read a new character next time; otherwise save the character. */
        if (d == ttype)
            peekc = NEED_CHAR;
        else
            peekc = d;
        sval.assign(buf.begin(), buf.end());
        return ttype;
    }

public:
    /**
     * Creates a tokenizer reading from is, with the default syntax:
     * letters and bytes 160..255 are word characters, bytes 0..' ' are
     * whitespace, '/' starts a single-line comment, '"' and '\'' are quote
     * characters, and numbers are parsed.
     */
    IstreamTokenizer(std::istream& is)
        : input(is),
          buf(),
          peekc(NEED_CHAR),
          pushedBack(false),
          forceLower(false),
          LINENO(1),
          eolIsSignificantP(false),
          slashSlashCommentsP(false),
          slashStarCommentsP(false),
          sval(),
          nval(0),
          ttype(TT_NOTHING) {
        init();
    }

    /** Makes every character ordinary by clearing the whole class table. */
    void resetSyntax() {
        for (int i = 0; i < CTYPE_SIZE; ++i)
            ctype[i] = 0;
    }

    /** Adds the word-character flag to every character in [low, hi]. */
    void wordChars(int low, int hi) {
        clampRange(low, hi);
        while (low <= hi)
            ctype[low++] |= CT_ALPHA;
    }

    /** Makes every character in [low, hi] whitespace (other flags are dropped). */
    void whitespaceChars(int low, int hi) {
        clampRange(low, hi);
        while (low <= hi)
            ctype[low++] = CT_WHITESPACE;
    }

    /** Makes every character in [low, hi] ordinary (all flags are dropped). */
    void ordinaryChars(int low, int hi) {
        clampRange(low, hi);
        while (low <= hi)
            ctype[low++] = 0;
    }

    /** Makes ch ordinary; values outside the table are ignored. */
    void ordinaryChar(int ch) {
        if (inTable(ch))
            ctype[ch] = 0;
    }

    /** Makes ch start a single-line comment; values outside the table are ignored. */
    void commentChar(int ch) {
        if (inTable(ch))
            ctype[ch] = CT_COMMENT;
    }

    /** Makes ch delimit quoted strings; values outside the table are ignored. */
    void quoteChar(int ch) {
        if (inTable(ch))
            ctype[ch] = CT_QUOTE;
    }

    /** Adds the numeric flag to '0'..'9', '.' and '-'. */
    void parseNumbers() {
        for (int i = '0'; i <= '9'; i++)
            ctype[i] |= CT_DIGIT;
        ctype['.'] |= CT_DIGIT;
        ctype['-'] |= CT_DIGIT;
    }

    /**
     * Determines whether or not ends of line are treated as tokens.
     * If flag is true, nextToken() returns TT_EOL (and sets ttype to it)
     * when an end of line is read.
     *
     * A line ends with a carriage return ('\r') or a newline ('\n'); a
     * carriage return followed immediately by a newline is reported as a
     * single end-of-line token.
     *
     * If flag is false, end-of-line characters are treated as whitespace
     * and serve only to separate tokens.
     *
     * @param flag true to report ends of line as tokens, false to treat
     *             them as whitespace.
     */
    void eolIsSignificant(bool flag) {
        eolIsSignificantP = flag;
    }

    /** Enables or disables recognition of slash-star block comments. */
    void slashStarComments(bool flag) {
        slashStarCommentsP = flag;
    }

    /** Enables or disables recognition of double-slash line comments. */
    void slashSlashComments(bool flag) {
        slashSlashCommentsP = flag;
    }

    /** When fl is true, word tokens are lower-cased before being stored in sval. */
    void lowerCaseMode(bool fl) {
        forceLower = fl;
    }

    /**
     * Reads the next token from the stream.
     *
     * @return the token type, which is also stored in ttype: TT_EOF,
     *         TT_EOL, TT_NUMBER, TT_WORD, the quote character of a quoted
     *         string, or the character itself for an ordinary character.
     */
    int nextToken() {
        if (pushedBack) {
            pushedBack = false;
            return ttype;
        }

        int c = peekc;
        if (c < 0)
            c = NEED_CHAR;
        if (c == SKIP_LF) {
            c = read();
            if (c < 0)
                return ttype = TT_EOF;
            if (c == '\n')
                c = NEED_CHAR;
        }
        if (c == NEED_CHAR) {
            c = read();
            if (c < 0)
                return ttype = TT_EOF;
        }
        ttype = c; /* Just to be safe */

        /* Set peekc so that the next invocation of nextToken will read
         * another character unless peekc is reset in this invocation. */
        peekc = NEED_CHAR;

        int cls = classOf(c);
        while ((cls & CT_WHITESPACE) != 0) {
            if (c == '\r') {
                LINENO++;
                if (eolIsSignificantP) {
                    /* The end of line is reported as a token; a following
                     * '\n' belongs to the same line end and is skipped. */
                    peekc = SKIP_LF;
                    return ttype = TT_EOL;
                }
                c = read();
                if (c == '\n')
                    c = read();
            } else {
                if (c == '\n') {
                    LINENO++;
                    if (eolIsSignificantP) {
                        /* The end of line is reported as a token. */
                        return ttype = TT_EOL;
                    }
                }
                c = read();
            }
            if (c < 0)
                return ttype = TT_EOF;
            cls = classOf(c);
        }

        if ((cls & CT_DIGIT) != 0)
            return scanNumber(c);

        if ((cls & CT_ALPHA) != 0)
            return scanWord(c);

        if ((cls & CT_QUOTE) != 0)
            return scanQuoted(c);

        if (c == '/' && (slashSlashCommentsP || slashStarCommentsP)) {
            c = read();
            if (c == '*' && slashStarCommentsP) {
                /* Block comment: skip up to the closing star-slash, counting
                 * every line end ("\r\n" counts once). */
                int prev = 0;
                for (;;) {
                    c = read();
                    if (c < 0)
                        return ttype = TT_EOF;
                    if (c == '/' && prev == '*')
                        break;
                    if (c == '\r' || (c == '\n' && prev != '\r'))
                        LINENO++;
                    prev = c;
                }
                return nextToken();
            }
            if (c == '/' && slashSlashCommentsP) {
                skipRestOfLine(read());
                return nextToken();
            }
            /* Now see if it is still a single line comment. */
            if ((ctype['/'] & CT_COMMENT) != 0) {
                /* c is already the first character after '/', so it must be
                 * examined rather than skipped. */
                skipRestOfLine(c);
                return nextToken();
            }
            peekc = c;
            return ttype = '/';
        }

        if ((cls & CT_COMMENT) != 0) {
            skipRestOfLine(read());
            return nextToken();
        }

        return ttype = c;
    }

    /**
     * Causes the next call to nextToken() to return the current token
     * again. Has no effect before the first token has been read.
     */
    void pushBack() {
        if (ttype != TT_NOTHING)
            pushedBack = true;
    }

    /** Returns the current line number; counting starts at 1. */
    int lineno() {
        return LINENO;
    }

    /* Declared only; the definition is not part of this header. */
    std::string toString();
};

/*
 * Definitions of the static constants. These are ordinary (non-inline)
 * definitions, so this header can be included from only one translation
 * unit of a program.
 */
const unsigned char IstreamTokenizer::CT_WHITESPACE = 1;
const unsigned char IstreamTokenizer::CT_DIGIT = 2;
const unsigned char IstreamTokenizer::CT_ALPHA = 4;
const unsigned char IstreamTokenizer::CT_QUOTE = 8;
const unsigned char IstreamTokenizer::CT_COMMENT = 16;

const int IstreamTokenizer::NEED_CHAR = INT_MAX;
const int IstreamTokenizer::SKIP_LF = INT_MAX - 1;

/* Token type values follow java.io.StreamTokenizer. */
const int IstreamTokenizer::TT_EOF = -1;
const int IstreamTokenizer::TT_EOL = '\n';
const int IstreamTokenizer::TT_NUMBER = -2;
const int IstreamTokenizer::TT_WORD = -3;
const int IstreamTokenizer::TT_NOTHING = -4;

#endif

/*
 * Reader comments from the original post:
 *   #1 vb2005xu (2011-07-29): Keep up the good work; I once wanted to build
 *      something like this myself.
 *   #2 jcs130 (2011-07-29): Wow... really impressive... I will give it a try
 *      myself some day.
 *   #3 rundout (2011-07-30): Great work, I hope you keep pushing forward.
 */