前言
一个小Demo, 用于分析的源文件比较简单, 主要的部分都有, 扩展比较容易.
将正则表达式表示的模式构造为状态转换图. 在本文中只列举状态转换图.
数字的状态转换
保留字和ID的状态转换
运算符的状态转换
用于分析的源文件
结果
前情提要
一、词素模式
二、打印Token
三、StateTransition类
四、StateTransition的构造与析构函数
五、StateTransition普通函数的实现
六、运算符的状态转换
七、数字的状态转换
八、保留字和ID的状态转换
九、空格, 制表符, 换行符设置
十、调用
正文
将使用<~> 标记来自哪个文件
一、词素模式
<~Token.h>
namespace Lexical_Analysis {
enum Tag {
RESERVE_WORD = 256,
ID,
METHOD,
RELOP,
NUM,
};
enum RelopTag {
LT, // <
LE, // <=
EQ, // =
NE, // <>
GT, // >
GE, // >=
};
enum ReserveWordTag {
INT,
IF,
ELSE,
THEN
};
class Token {
public:
Tag tag;
explicit Token(Tag t): tag(t) { }
virtual ~Token() = default;
};
// 运算符值
class RelopToken : public Token {
public:
RelopTag relop;
explicit RelopToken(RelopTag tag): Token(RELOP), relop(tag) { }
};
// 保留字tag, 和词素
class ReserveWordToken : public Token {
public:
ReserveWordTag reserveWordTag;
std::string lexeme;
ReserveWordToken(ReserveWordTag t, std::string l): Token(RESERVE_WORD), reserveWordTag(t), lexeme(l) { }
};
// 存储id 值
class IdToken : public Token {
public:
std::string value;
explicit IdToken(std::string v): Token(ID), value(v) { }
};
// 存储int 值
class IntToken : public Token {
public:
int value;
explicit IntToken(int v): Token(NUM), value(v) { }
};
// 存储double 值
class DoubleToken : public Token {
public:
double value;
explicit DoubleToken(double v): Token(NUM), value(v) { }
};
}
二、打印Token
void Lexical_Analysis::StateTransition::print_token(Lexical_Analysis::Token *token) {
RelopToken* relopToken;
ReserveWordToken* reserveWordToken;
IdToken* idToken;
IntToken* intToken;
DoubleToken* doubleToken;
if ((relopToken = dynamic_cast<RelopToken*>(token))) {
std::cout << "<" << getRelopTagStr(relopToken->relop) << ">" << std::endl;
} else if ((reserveWordToken = dynamic_cast<ReserveWordToken*>(token))) {
std::cout << "<" << getTagStr(reserveWordToken->tag) << ", " << getReserveWordTagStr(reserveWordToken->reserveWordTag) << ">" << std::endl;
} else if ((idToken = dynamic_cast<IdToken*>(token))) {
std::cout << "<id, " << idToken->value << ">" << std::endl;
} else if ((intToken = dynamic_cast<IntToken*>(token))) {
std::cout << "<int, " << intToken->value << ">" << std::endl;
} else if ((doubleToken = dynamic_cast<DoubleToken*>(token))) {
std::cout << "<double, " << doubleToken->value << ">" << std::endl;
}
}
std::string Lexical_Analysis::StateTransition::getTagStr(Lexical_Analysis::Tag val) {
if (val == Tag::RESERVE_WORD) return "RESERVE_WORD";
else if (val == Tag::ID) return "ID";
else if (val == Tag::RELOP) return "RELOP";
else if (val == Tag::METHOD) return "METHOD";
else if (val == Tag::NUM) return "NUM";
return std::__cxx11::string();
}
std::string Lexical_Analysis::StateTransition::getRelopTagStr(Lexical_Analysis::RelopTag val) {
if (val == RelopTag::LT) return "<";
else if (val == RelopTag::LE) return "<=";
else if (val == RelopTag::EQ) return "=";
else if (val == RelopTag::NE) return "<>";
else if (val == RelopTag::GT) return ">";
else if (val == RelopTag::GE) return ">=";
return std::__cxx11::string();
}
std::string Lexical_Analysis::StateTransition::getReserveWordTagStr(Lexical_Analysis::ReserveWordTag val) {
if (val == ReserveWordTag::INT) return "INT";
else if (val == ReserveWordTag::IF) return "IF";
else if (val == ReserveWordTag::ELSE) return "ELSE";
else if (val == ReserveWordTag::THEN) return "THEN";
return std::__cxx11::string();
}
三、StateTransition类
<~StateTransition.h>
namespace Lexical_Analysis {
class StateTransition {
public:
explicit StateTransition(Buffer<>* _buffer);
~StateTransition();
private:
Buffer<>* buffer; // 缓冲区
std::map<std::string, ReserveWordToken*> reserveWords; // 记号表, 见构造函数初始化
private:
/**
* 将token存入记号表
* @param token
*/
void reserve(ReserveWordToken* token);
public:
/**
* 运算符的状态转换实现
*/
Token* getRelop();
/**
* 数字的状态转换实现
*/
Token* getNumber();
/**
* 字符序列的状态转换实现
*/
Token* getCharacterSequence();
/**
* 更新空格 制表符 换行
*/
void getBlankTabNewline();
/**
* 全局恢复策略, 将forward值重置为lexemeBegin的值
* 使用另一个状态图从尚未处理的输入部分的真实位置开始识别
*/
void fail();
/**
* 回退一个字符
*/
void retract();
public:
void print_token(Token* token);
std::string getTagStr(Tag val);
std::string getRelopTagStr(RelopTag val);
std::string getReserveWordTagStr(ReserveWordTag val);
};
}
四、StateTransition的构造与析构函数
<~StateTransition.cpp>
Lexical_Analysis::StateTransition::StateTransition(Lexical_Analysis::Buffer<> *_buffer):buffer(_buffer) {
reserve(new ReserveWordToken(ReserveWordTag::INT, "int"));
reserve(new ReserveWordToken(ReserveWordTag::IF, "if"));
reserve(new ReserveWordToken(ReserveWordTag::ELSE, "else"));
reserve(new ReserveWordToken(ReserveWordTag::THEN, "then"));
}
Lexical_Analysis::StateTransition::~StateTransition() {
}
五、StateTransition普通函数的实现
<~StateTransition.cpp>
void Lexical_Analysis::StateTransition::fail() {
buffer->forward = buffer->lexemeBegin;
}
void Lexical_Analysis::StateTransition::retract() {
buffer->pre();
}
void Lexical_Analysis::StateTransition::reserve(Lexical_Analysis::ReserveWordToken *token) {
reserveWords[token->lexeme] = token;
}
六、运算符的状态转换, 在下文的三个状态转换中, 都首先检查了当前字符是否满足当前状态转换需要的条件, 如果不满足则直接返回. 然后则是一个while循环与多路分支语句.
<~StateTransition.cpp>
Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getRelop() {
int state = 0;
char c = buffer->cur();
bool is_loop = (c == '<' || c == '=' || c == '>');
if (!is_loop) return nullptr;
while (is_loop) {
switch (state) {
case 0:
c = buffer->next();
if (c == '<') state = 1;
else if (c == '=') state = 5;
else if (c == '>') state = 6;
break;
case 1:
c = buffer->next();
if (c == '=') state = 2;
else if (c == '>') state = 3;
else state = 4;
break;
case 2:
// 需要将lexemeBegin指针设置为当前词素之后的第一个字符,
// 因此使forward向前移动, 更改lexemeBegin指针后在回退
buffer->next();
buffer->lexemeBegin = buffer->forward;
retract();
return new RelopToken(RelopTag::LE);
case 3:
buffer->next();
buffer->lexemeBegin = buffer->forward;
retract();
return new RelopToken(RelopTag::NE);
case 4:
buffer->lexemeBegin = buffer->forward;
retract();
return new RelopToken(RelopTag::LT);
case 5:
buffer->next();
buffer->lexemeBegin = buffer->forward;
retract();
return new RelopToken(RelopTag::EQ);
case 6:
c = buffer->next();
if (c == '=') state = 7;
else state = 8;
break;
case 7:
buffer->next();
buffer->lexemeBegin = buffer->forward;
retract();
return new RelopToken(RelopTag::GE);
case 8:
// 当前forward指针已经指向当前词素之后的第一个位置, 不用更改
buffer->lexemeBegin = buffer->forward;
retract();
return new RelopToken(RelopTag::GT);
}
}
}
七、数字的状态转换, 包括小数位
<~StateTransition.cpp>
void calculation(int& result, int val) {
result = result * 10 + val;
}
Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getNumber() {
int state = 12;
bool is_loop = is_digit(buffer->cur());
/* 直接返回 */
if (!is_loop) return nullptr;
int val = 0; // 整数部分
double dVal = 0.0; // 小数部分
int place = 10; // 小数点后第一位/10, 第二位/100 ...
bool isDecimal = false; // 是否为小数
int ePos = 0; // E 后面的数 (E06)
bool isNegative = false; // E后面的数是否为负数 (E-03)
char c;
while (is_loop) {
switch (state) {
case 12:
c = buffer->next();
if (isdigit(c)) {
calculation(val, to_digit_10(c));
state = 13;
}
break;
case 13:
c = buffer->next();
if (isdigit(c)) {
calculation(val, to_digit_10(c));
state = 13;
}
else if (c == '.') {
isDecimal = true;
state = 14;
}
else if (c == 'E') state = 16;
else state = 19;
break;
case 14:
c = buffer->next();
if (isdigit(c)) {
double d = 1.0 * to_digit_10(c) / place;
dVal += d;
place *= 10;
state = 15;
}
else state = 19;
break;
case 15:
c = buffer->next();
if (isdigit(c)) {
double d = 1.0 * to_digit_10(c) / place;
dVal += d;
place *= 10;
state = 15;
}
else if (c == 'E') state = 16;
else state = 19;
break;
case 16:
c = buffer->next();
if (isdigit(c)) {
calculation(ePos, to_digit_10(c));
state = 18;
}
else if (c == '+' || c == '-') {
if (c == '-') isNegative = true;
state = 17;
}
break;
case 17:
c = buffer->next();
if (isdigit(c)) {
calculation(ePos, to_digit_10(c));
state = 18;
}
break;
case 18:
c = buffer->next();
if (isdigit(c)) {
calculation(ePos, to_digit_10(c));
state = 18;
}
else state = 19;
break;
case 19:
is_loop = false;
break;
}
}
// 使lexemeBegin指向刚才找到的词素之后的第一个字符, 处理完词素后, forward回退一个位置
char* tmp = buffer->forward;
if (isDecimal) { // 如果是小数
dVal += val;
if (isNegative) for (int i = 0; i < ePos; i++) dVal /= 10;
else for (int i = 0; i < ePos; i++) dVal *= 10;
// 回退, 并设置lexemeBegin值
retract();
buffer->lexemeBegin = tmp;
DoubleToken* token = new DoubleToken(dVal);
return token;
} else {
if (isNegative) for (int i = 0; i < ePos; i++) val /= 10;
else for (int i = 0; i < ePos; i++) val *= 10;
retract();
buffer->lexemeBegin = tmp;
return new IntToken(val);
}
}
八、保留字和ID的状态转换
<~StateTransition.cpp>
Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getCharacterSequence() {
int state = 9;
bool is_loop = is_letter(buffer->cur());
/* 直接返回 */
if (!is_loop) return nullptr;
char c;
while (is_loop) {
switch (state) {
case 9:
c = buffer->next();
if (is_underline(c) || is_letter(c)) state = 10;
break;
case 10:
c = buffer->next();
if (is_letter(c) || is_digit(c) || is_underline(c)) state = 10;
else state = 11;
break;
case 11:
is_loop = false;
break;
}
}
// 使lexemeBegin指向刚才找到的词素之后的第一个字符, 处理完词素后, forward回退一个位置
char* tmp = buffer->forward;
if (!buffer->is_end) {
// 如果没有读到结尾, 执行回退
retract();
}
std::string result = buffer->getString();
buffer->lexemeBegin = tmp;
// 从符号表中查找是否为关键词(保留字), 如果是则返回关键词token, 否则返回id token
auto it = reserveWords.find(result);
if (it != reserveWords.end()) {
return (*it).second;
}
return new IdToken(result);
}
九、空格, 制表符, 换行符设置
<~StateTransition.cpp>
void Lexical_Analysis::StateTransition::getBlankTabNewline() {
bool is_loop = is_blank_tab_newline(buffer->cur());
if (!is_loop) return ;
while (is_blank_tab_newline(buffer->cur())) {
buffer->next();
}
buffer->lexemeBegin = buffer->forward;
}
十、调用
<~main.cpp>
int main() {
string fileStr = "/home/yanuas/CompilingPrinciple/LexicalAnalysis/code.src";
Buffer<1024> buffer(fileStr);
StateTransition transition(&buffer);
while (1) {
Token* token = transition.getRelop();
if (token != nullptr) transition.print_token(token);
token = transition.getCharacterSequence();
if (token != nullptr) transition.print_token(token);
token = transition.getNumber();
if (token != nullptr) transition.print_token(token);
transition.getBlankTabNewline();
if (transition.buffer->is_end) break;
}
return 0;
}