编译原理之正则表达式转NFA
本⽂转载⾃
输⼊⼀个正则表达式,输出⼀个NFA。
我的做法:输⼊⼀个字符串表⽰正则,输出则是把输出到⼀个.dot⽂件中并将dot⽂件编译成pdf,fedora需要sudo yum install dot,然后evince XXX.pdf就可以查看⽣成的NFA了。
具体算法是按照龙书上的Tompson算法来的。
废话不多说,放码过来:
/*
Author:ChrisZZ(zchrissirhcz@gmail)
Time:2013-12-25 14:13:09
输⼊:正则表达式
输出:⾃动机
算法步骤:
1.把正则表达式转化为后缀表达式
2.把后缀表达式转化为NFA
3.⽤dot语⾔把NFA输出到PDF
参考:
1.Regular Expression Matching Can Be Simple And Fast
swtch/~rsc/regexp/regexp1.html
2.龙书 chap
3.7.4 从正则表达式构造NFA
3.YCC学长的project中dot语⾔的使⽤
其他说明:
1.需要安装dot,并添加到系统path中
2.在windows下运⾏时,控制台因为编码不⽀持可能导致中⽂提⽰⽆法显⽰
*/
#include <iostream>
#include <string>
#include <stdio.h>
#include <stack>
#include <string.h>
#include <stdexcept>
#include <stdlib.h>
using namespace std;
const int Match = 256;
const int Split = 257;//表⽰epsilon分⽀
struct Paren{//括号结构体
int natom;
int nalt;
};
string re2post(string re){
Paren paren;//括号
stack<struct Paren>parenStk;
string postExpr="";
int i, len=re.length();
int nalt=0, natom=0;
const string invalidRegExp = "⾮法的正则表达式";
for(i=0; i<len; i++){
if(isspace(re[i])) continue;
if(isalpha(re[i])){
if(natom>1){
natom--;
postExpr = postExpr + '.';
}
natom++;
postExpr = postExpr + re[i];
}
else if(re[i]=='('){
if(natom>1){
postExpr = postExpr + '.';
}
paren.natom = natom;
paren.nalt = nalt;
parenStk.push(paren);
nalt = 0;
natom = 0;
}
else if(re[i]==')'){
if(natom==0 || pty())
throw runtime_error(invalidRegExp+":括号不匹配");
while(--natom>0){//⽐如((a|b)(c|d))模式,当上⼀次匹配完倒数第⼆个右括号后,natom为2,需要添加'.'
postExpr = postExpr + '.';
}
while(nalt-->0){
postExpr = postExpr + '|';
}
p();
parenStk.pop();
natom = paren.natom;
nalt = paren.nalt;
natom++;
}
else if(re[i]=='*'){
if(natom==0)
throw runtime_error(invalidRegExp+":提前出现'*'");
postExpr = postExpr + re[i];
}
else if(re[i]=='|'){
if(natom==0) throw runtime_error(invalidRegExp+":提前出现'|'"); while(--natom>0){
postExpr = postExpr + '.';
}
nalt++;
}
else
throw runtime_error(invalidRegExp);
}
if(!pty())
throw runtime_error(invalidRegExp+":括号不匹配");
while(--natom>0){
postExpr = postExpr + '.';
}
while(nalt-->0){
postExpr = postExpr + '|';
}
return postExpr;
}
class NFA;
/*
* c<256表⽰edge权重为c;
* c=256表⽰终结状态,匹配成功
* c=257表⽰分⽀(split)
*/
class State{
friend class NFA;
friend void nfa2graph(State* head, const string& re);
public:
State(int c=256, State* out=NULL, State* out1=NULL){
this->c = c;
this->out = out;
this->out1 = out1;
this->id = 0;
}
void setId(int id){
this->id = id;
}
private:
int c;
int id;//状态的编号
State* out;//从本状态出去的状态集合的头指针
State* out1;//两个分⽀的情况
};
class NFA{
public:
NFA(){
head = NULL;
tail = NULL;
}
NFA(const int& c){
tail = new State(Match, NULL, NULL);
head = new State(c, tail, NULL);
}
void doCat(NFA& nfa){
tail->out = nfa.head;
tail->c = Split;
tail = nfa.tail;
nfa.head = NULL;
nfa.tail = NULL;
}
void doUnion(NFA& nfa){
State* newHead = new State(Split, head, nfa.head);
State* newTail = new State(Match, NULL, NULL);
tail->c = Split;
tail->out = newTail;
nfa.tail->c = Split;
nfa.tail->out = newTail;
tail = newTail;
head = newHead;
nfa.head = NULL;
nfa.tail = NULL;
}
void doStar(){
State* newTail = new State(Match, NULL, NULL);
State* newHead = new State(Split, head, newTail);
tail->c = Split;
tail->out = newTail;
tail->out1 = head;
tail = newTail;
head = newHead;
}
void nfa2graph(const string& re){
char myfile[100];
printf("请输⼊⼀个⽂件名,⽤来保存⽣成的NFA-graph(不必提供后缀):\n"); scanf("%s", myfile);
printf("已将DOT⽂件存储在\"%s.dot\",\n", myfile);
printf("PDF⽂件则存储在\"%s.dot.pdf\".\n", myfile);
int i;
while(myfile[i]!='\0')
i++;
myfile[i] = '.';
myfile[i+1] = 'd';
myfile[i+2] = 'o';
myfile[i+3] = 't';
myfile[i+4] = '\0';
FILE *file = fopen(myfile, "w");
fputs("digraph {\n", file);
fputs("\t\"", file);
int len=re.length();
for(i=0; i<len; i++){
fprintf(file, "%c", re[i]);
}
fputs("\" [shape = plaintext]\n", file);
fputs("\trankdir = LR\n", file);
fputs("\t\"\" [shape = point]\n", file);
fputs("\t\"\" -> 1 [label = Start]\n\n", file);
int id = 1;
char circle[2000];
memset(circle, 0, sizeof(circle));
State* p;
stack<State*> staStk;
head->setId(id++);
staStk.push(head);
while(!pty()){
p = p();
staStk.pop();
char flag = 1;
cout << "p->c=" << p->c << endl;
if(p->c < Match){
cout << "p->out->id=" << p->out->id << endl;
if(p->out->id==0){
p->out->id = id++;
cout << "id=" << id << endl; }
else
flag = 0;
fprintf(file, "\t%d -> %d [label = \"%c\"]\n", p->id, (p->out)->id, p->c); State *what = p->out;
if(flag) //push(*what);
staStk.push(what);
} else if(p->c == Match){
circle[p->id] = 1;
} else{ //对应Split的情形
if(p->out->id==0)
p->out->id = id++;
else
flag = 0;
fprintf(file, "\t%d -> %d [label = <ε>]\n", p->id, p->out->id);
State *what = p->out;
if(flag) staStk.push(what);
if(p->out1!=NULL){
flag = 1;
if(p->out1->id==0)
p->out1->id = id++;
else
flag = 0;
fprintf(file, "\t%d -> %d [label = <ε>]\n", p->id, p->out1->id); what = p->out1;
if(flag) staStk.push(what);
}
}
}
for(i=1; i<id; i++){
fprintf(file, "\t%d [shape = circle", i);
if(circle[i])
fputs(", peripheries = 2", file);
fprintf(file, "]\n");
}
fputs("}", file);
fclose(file);
char cmd[108];
sprintf(cmd, "dot %s -O -Tpdf", myfile);
if(system(cmd)==0){
正则匹配原理printf("成功⽣成pdf图像!\n");
//printf("Linux⽤户可以使⽤evince file.pdf &命令打开~\n");
}
else
printf("悲剧!⽣成pdf图像时出现错误..\n");
}
private:
State* head;
State* tail;
};
NFA post2nfa(const string& postExpr){
stack<NFA> nfaStk;
NFA e1, e2, e;
int i, len=postExpr.length();
for(i=0; i<len; i++){
switch(postExpr[i]){
case '.':
e2 = p();
nfaStk.pop();
e1 = p();
nfaStk.pop();
e1.doCat(e2);
nfaStk.push(e1);
break;
case '|':
e2 = p();
nfaStk.pop();
e1 = p();
nfaStk.pop();
e1.doUnion(e2);
nfaStk.push(e1);
break;
case '*':
e = p();
nfaStk.pop();
e.doStar();
nfaStk.push(e);
break;
default://
NFA alpha(postExpr[i]);
nfaStk.push(alpha);
}
}
e = p();
nfaStk.pop();
if(!pty())
throw runtime_error("未知错误");
return e;
}
int main(){
string re;
while(true){
cout << "请输⼊⼀个正则表达式:\n";
cin >> re;
string postExpr = re2post(re);
cout << "postExpr is : " << postExpr << endl; NFA nfa = post2nfa(postExpr);
nfa.nfa2graph(re);
cout << "继续吗?(y/n)\n" << endl;
char c;
cin >> c;
while(c!='y' && c!='n'){
cout << "请输⼊'y'或'n'.\n";
c=getchar();
}
if(c=='n')
break;
}
cout << "Bye~\n";
return 0;
}
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论