学习编译原理的时候做的一个小型LEX,及词法分析产生器,可以根据输入的正则表达式生成自动机,从中识别出代码中的token串,存入一个二维表,也就是SymbolTable。
主体代码如下:
2 #include " stdio.h "
3 using namespace std;
4 #include < String >
5 #include < Stack >
6 #include < set >
7 #include < queue >
8 #include < fstream >
9 #include < iomanip >
10 struct path
11 {
12 int from; // 一个path代表从一个状态到另一个状态的路径
13 int to;
14 string strPath; // strPath代表这个结构体处理的路径
15 };
16 struct dealStruct
17 {
18 int stCount;
19 struct path * resPtr;
20 };
21 struct setNode
22 {
23 set < int > * Dset;
24 int num;
25 struct setNode * next;
26 };
27 bool judgeLD( char v ) // 判断正则表达式中是否含有非法字符
28 {
29 if ( ( int (v) > 47 && int (v) <= 57 ) ||
30 ( int (v) >= 65 && int (v) <= 90 ) ||
31 ( int (v) >= 97 && int (v) <= 122 ) ||
32 v == ' > ' || v == ' < ' || v == ' = ' ||
33 v == ' ` ' || v == ' + ' || v == ' - ' )
34 return true ;
35 else
36 return false ;
37 }
38 struct dealStruct * dealWithStr( string , int ); // 通过正则表达式生成NFA
39 int jihe( int , stack < struct path *> & ); // 处理正则表达式时'*’的运算函数
40 int lianjie( int , stack < struct path *> & ); // 处理正则表达式时'&’(自定义运算符ab相当于a&b)的运算函数
41 int huo( int , stack < struct path *> & );; // 处理正则表达式时'|’的运算函数
42 int reduceNFA( int , int * , int ); // NFA->DFA函数
43 void BFS( int , int ); // 计算一个状态的ξ闭包
44 void produceC( int , string * ); // 生成词法分析器的.cpp文件
45 int arrForBFS[ 100 ] = { 0 }; // 存放BFS函数生成的闭包的数组
46 string T[ 2000 ][ 2000 ]; // NFA的数组表示
47 string recT[ 2000 ][ 2000 ]; // DFA的数组表示
48 int stateCountDFA; // DFA状态数量
49 int reduceState[ 2 ][ 2000 ]; // 化简DFA用的数组
50 set < char > tv; // 所有终结符
51 set < char > ::iterator Vt;
52 void reduceDFA(); // 化简DFA
53 void combine( int , int ); // 化简DFA时合并两个状态
54 int regCount; // 输入正则表达式的个数
55
56 int main()
57 {
58 regCount = 0 ;
59 int i,j,k;
60 int temp = 0 ;
61
62 string * regArr;
63 string * typeName;
64 int * typeIndex;
65 int ** resultEnterExt;
66
67
68 ifstream inRegular;
69 inRegular.open( " RegularExpressions.txt " );
70 inRegular >> regCount; // 正则表达式个数
71 typeName = new string [regCount];
72 typeIndex = new int [regCount];
73 resultEnterExt = new int * [regCount]; // 定义一个二位动态数组用于存放每个正则表达式所生成的NFA的起始状态和终结状态
74 for ( i = 0 ;i < regCount;i ++ )
75 resultEnterExt[i] = new int [ 2 ];
76 regArr = new string [regCount];
77 for ( i = 0 ;i < regCount;i ++ )
78 {
79 inRegular >> typeName[i];
80 inRegular >> regArr[i];
81 } // 读入正则表达式
82
83 int stateCount = - 1 ;
84 for ( i = 0 ;i < regCount;i ++ ) // 遍历每个正则表达式
85 {
86 for ( j = 0 ;j < regArr[i].length() - 1 ;j ++ )
87 if ( ( judgeLD(regArr[i].at(j) ) && ( judgeLD(regArr[i].at(j + 1 )) || regArr[i].at(j + 1 ) == ' ( ' ) ) ||
88 ( regArr[i].at(j) == ' ) ' && ( regArr[i].at(j + 1 ) == ' ( ' || judgeLD(regArr[i].at(j + 1 )) ) ) ||
89 ( regArr[i].at(j) == ' * ' && ( regArr[i].at(j + 1 ) == ' ( ' || judgeLD(regArr[i].at(j + 1 )) ) ) )
90 regArr[i].insert(j + 1 , " & " ); // 预处理正则表达式,添加自定义运算符'&'
91
92
93 regArr[i].insert(regArr[i].length(), " #% " ); // 在末尾添加结束符
94 cout << regArr[i] << endl; // 输出处理后的正则表达式
95
96
97 j = 0 ;
98 struct path * p = NULL;
99 struct dealStruct * result = dealWithStr( regArr[i],stateCount ); // 对于每个正则表达式调用NFA生成函数
100
101 stateCount = result -> stCount;
102 resultEnterExt[i][ 0 ] = result -> resPtr -> from; // 记录该正则表达式的起始状态和终结状态
103 resultEnterExt[i][ 1 ] = result -> resPtr -> to;
104 typeIndex[i] = resultEnterExt[i][ 1 ];
105 /* cout<<"The number of states now is "<<stateCount<<endl;
106 cout<<"The regular perform dealt with just now is "<<result->resPtr->strPath<<endl;
107 cout<<"The from dealt with just now is "<<resultEnterExt[i][0]<<endl;
108 cout<<"The to dealt with just now is "<<resultEnterExt[i][1]<<endl; */
109 }
110
111 stateCount ++ ; // 汇总所有正则表达式的NFA
112 cout << " NFA合并完毕! " << endl;
113
114 for ( i = 0 ;i < regCount;i ++ )
115 T[stateCount][resultEnterExt[i][ 0 ]] = ' ` ' ; // 从公共起始状态出发
116
117 /* for( i = 0;i <= stateCount;i++ ) //输出产生的NFA以检验
118 {
119 for( j = 0;j <= stateCount;j++ )
120 {
121 if( T[i][j] == "" ) cout<<" |";
122 else cout<<T[i][j]<<'|';
123 }
124 cout<<endl;
125 } */
126
127 stateCountDFA = reduceNFA( stateCount, typeIndex, regCount ); // 调用NFA->DFA函数生成DFA
128 reduceDFA();
129
130 produceC( stateCountDFA, typeName ); // 产生.cpp代码
131
132 system( " pause " );
133
134 return 0 ;
135 }
136
137 /* 以下为NFA的生成函数,采用三个栈实现Thompson算法,第一个栈用于读入字符和运算符
138 第二个栈用于逆序,第三个栈专门用于处理一对括号内部的表达式,其实用两个栈就可以
139 实现,及把第三个栈舍弃,直接把括号的计算放在第一个栈中进行也可。为了计算更加
140 清楚不易出错故而多加了一个栈*
141 算法思路:读入正则表达式放入栈A中,*,&,|的优先级分别定为3,2,1,而表达式末
142 尾的'#'优先级最低,看做0。用一个三元数组存放当各个运算符的个数,字符均放入栈A中,
143 读到运算符时,当栈外的运算符的优先级大于等于站内所有运算符时,该运算符进栈,
144 如果低于所有站内运算符,计算站内所有运算符表的运算,直到栈内没有比栈外更高优先
145 级的运算符。
146 读到(的时候,(进栈,之后不论读到什么均进栈直到读到第一个),将这对()中的字符串
147 提出,用栈B逆序后放入栈C,在栈C中按上述方式处理直到只有一个元素,将元素返回栈A
148 栈中存储的是指向path结构体的指针 */
149 struct dealStruct * dealWithStr( string str, int stateCountV )
150 {
151 int j = 0 ; int temp = 0 ;
152 struct path * p = NULL;
153 int currentPri[ 3 ] = { 0 }; // 用于储存栈中三种运算符(*,&,|)的个数,便于计算
154 stack < struct path *> stackA;
155 while ( str.at(j) != ' % ' )
156 {
157 if ( judgeLD(str.at(j)) ) // 如果是字符,放入栈A中
158 {
159 tv.insert(str.at(j)); // 放入tv中
160 stateCountV ++ ;
161 temp = stateCountV;
162 stateCountV ++ ;
163 T[temp][stateCountV] += str.at(j);
164 p = new path();
165 p -> from = temp;
166 p -> to = stateCountV;
167 p -> strPath += str.at(j);
168 stackA.push(p);
169 // cout<<"push a new letter "<<p->strPath<<endl;
170 j ++ ;
171 }
172 else if ( str.at(j) == ' * ' ) // 如果是'*'由于没有比它优先级更高的了,
173 { // 故必然是入栈
174 currentPri[ 0 ] ++ ;
175 p = new path();
176 p -> from = - 1 ;
177 p -> to = - 1 ;
178 p -> strPath += " * " ;
179 stackA.push(p);
180 // cout<<"push *"<<endl;
181 j ++ ;
182 }
183 else if ( str.at(j) == ' & ' ) // 如果是'&',检查之前有没有'*'
184 {
185 if ( currentPri[ 0 ] == 0 ) // 没有的话,入栈
186 currentPri[ 1 ] ++ ;
187 else
188 {
189 currentPri[ 0 ] -- ; // 有,计算*
190 currentPri[ 1 ] ++ ;
191
192 stateCountV = jihe( stateCountV, stackA );
193 }
194 p = new path();
195 p -> from = - 2 ;
196 p -> to = - 2 ;
197 p -> strPath = " & " ;
198 stackA.push(p); // 不论有没有'*','&'必然会入栈
199 // cout<<"push &"<<endl;
200 j ++ ;
201 }
202 else if ( str.at(j) == ' | ' ) // 如果是'|',查看前面有没有*,&
203 {
204 while ( currentPri[ 0 ] > 0 || currentPri[ 1 ] > 0 ) // 一直算直到没有
205 {
206 if ( currentPri[ 0 ] > 0 )
207 {
208 currentPri[ 0 ] -- ;
209 stateCountV = jihe( stateCountV, stackA );
210 }
211 else
212 {
213 currentPri[ 1 ] -- ;
214 stateCountV = lianjie( stateCountV, stackA );
215 }
216 }
217 currentPri[ 2 ] ++ ; // 不论有没有*,&,最后'|'必然还是要入栈
218 p = new path();
219 p -> from = - 3 ;
220 p -> to = - 3 ;
221 p -> strPath = " | " ;
222 stackA.push(p);
223 // cout<<"push |"<<endl;
224 j ++ ;
225 }
226 else if ( str.at(j) == ' # ' ) // 如果是'#',则计算前面所有的运算
227 {
228 while ( currentPri[ 0 ] > 0 || currentPri[ 1 ] > 0 || currentPri[ 2 ] > 0 )
229 {
230 if ( currentPri[ 0 ] > 0 )
231 {
232 currentPri[ 0 ] -- ;
233 stateCountV = jihe( stateCountV, stackA );
234 }
235 else if ( currentPri[ 1 ] > 0 )
236 {
237 currentPri[ 1 ] -- ;
238 stateCountV = lianjie( stateCountV, stackA );
239 }
240 else
241 {
242 currentPri[ 2 ] -- ;
243 stateCountV = huo( stateCountV, stackA );
244 }
245
246 }
247 // cout<<"The End, there is one struct path in stackA: "<<stackA.size()<<endl;
248 j ++ ;
249 // cout<<"It is "<<(stackA.top())->strPath<<endl;
250 }
251 else if ( str.at(j) == ' ( ' ) // 发现左括号,进入括号处理过程
252 {
253 // cout<<"发现一个最外层括号"<<endl;
254 int parenthCount = 1 ;
255 bool flag = true ;
256 while ( parenthCount > 0 )
257 {
258 if ( str.at(j) == ' ( ' )
259 {
260 p = new path();
261 p -> from = - 5 ;
262 p -> to = - 5 ;
263 p -> strPath += " ( " ;
264 stackA.push(p);
265 // cout<<"放入一左个括号"<<endl;
266 if ( ! flag )parenthCount ++ ;
267 flag = false ;
268 }
269 else if ( str.at(j) == ' ) ' ) // 发现第一个右括号
270 {
271 // cout<<"发现一个内层右括号"<<endl;
272 parenthCount -- ;
273
274 stack < struct path *> stackB; // 定义另两个栈
275 stack < struct path *> stackC;
276
277 p = new path();
278 p -> from = - 6 ;
279 p -> to = - 6 ;
280 p -> strPath += " # " ;
281 stackB.push(p);
282 while ( (stackA.top()) -> from != - 5 ) // 栈Apop到第一个(
283 {
284 stackB.push( stackA.top() ); // 将栈Apop出的放入B
285 // cout<<"stackA弹出一个"<<stackA.top()->strPath<<endl;
286 stackA.pop();
287
288 }
289 // cout<<"stackA想弹出一个左小括号"<<stackA.top()->strPath<<endl;
290 stackA.pop(); // 去掉左小括号
291
292 int pri[ 3 ] = { 0 };
293 while ( ! stackB.empty() ) // 开始处理栈B内的表达式
294 {
295 p = stackB.top();
296 stackB.pop();
297 // cout<<"stackB弹出一个"<<p->from<<endl;
298 if ( p -> from >= 0 )
299 stackC.push(p); // 用栈C处理栈B的表达式
300 else if ( p -> from == - 1 )
301 {
302 pri[ 0 ] ++ ;
303 stackC.push(p);
304 }
305 else if ( p -> from == - 2 )
306 {
307 if ( pri[ 0 ] > 0 )
308 {
309 pri[ 0 ] -- ;
310 stateCountV = jihe( stateCountV, stackC );
311 }
312
313 stackC.push(p);
314 pri[ 1 ] ++ ;
315 }
316 else if ( p -> from == - 3 )
317 {
318 while ( pri[ 0 ] > 0 || pri[ 1 ] > 0 )
319 {
320 if ( pri[ 0 ] > 0 )
321 {
322 pri[ 0 ] -- ;
323 stateCountV = jihe( stateCountV, stackC );
324 }
325 else
326 {
327 pri[ 1 ] -- ;
328 stateCountV = lianjie( stateCountV, stackC );
329 }
330
331 }
332 pri[ 2 ] ++ ;
333 stackC.push(p);
334 }
335 else if ( p -> from == - 6 )
336 {
337 while ( pri[ 0 ] > 0 || pri[ 1 ] > 0 || pri[ 2 ] > 0 )
338 {
339 if ( pri[ 0 ] > 0 )
340 {
341 pri[ 0 ] -- ;
342 stateCountV = jihe( stateCountV, stackC );
343 }
344 else if ( pri[ 1 ] > 0 )
345 {
346 pri[ 1 ] -- ;
347 stateCountV = lianjie( stateCountV, stackC );
348 }
349 else
350 {
351 pri[ 2 ] -- ;
352 stateCountV = huo( stateCountV, stackC );
353 }
354 }
355 }
356 else
357 cout << " 括号内含非法字符! " << endl;
358 }
359 // cout<<"stackC 中此时应该有1个东东"<<stackC.size()<<"它是"<<stackC.top()->strPath<<endl;
360
361 // cout<<"此时的parenthCount为"<<parenthCount<<endl;
362 stackA.push(stackC.top()); // 处理完成,栈C弹出结果
363 stackC.pop(); // 放入栈A
364 }
365 else if ( str.at(j) == ' * ' )
366 {
367 p = new path();
368 p -> from = - 1 ;
369 p -> to = - 1 ;
370 p -> strPath += " * " ;
371 stackA.push(p);
372 }
373 else if ( str.at(j) == ' & ' )
374 {
375 p = new path();
376 p -> from = - 2 ;
377 p -> to = - 2 ;
378 p -> strPath += " & " ;
379 stackA.push(p);
380 }
381 else if ( str.at(j) == ' | ' )
382 {
383 p = new path();
384 p -> from = - 3 ;
385 p -> to = - 3 ;
386 p -> strPath += " | " ;
387 stackA.push(p);
388 }
389 else
390 {
391 stateCountV ++ ;
392 temp = stateCountV;
393 stateCountV ++ ;
394 T[temp][stateCountV] += str.at(j);
395 p = new path();
396 p -> from = temp;
397 p -> to = stateCountV;
398 p -> strPath += str.at(j);
399 stackA.push(p);
400 }
401 tv.insert(str.at(j)); // 放入tv中
402 j ++ ;
403 }
404 }
405 else
406 cout << " 含有非法字符! " << endl;
407 }
408
409 struct dealStruct * res = new dealStruct();
410 res -> stCount = stateCountV;
411 res -> resPtr = stackA.top();
412 stackA.pop();
413 cout << " NFA生成完毕! " << endl;
414 return res;
415 }
416
417
418 int jihe( int stateV, stack < struct path *> & stackA )
419 {
420 struct path * b = stackA.top();stackA.pop();
421 struct path * a = stackA.top();stackA.pop();
422 // cout<<"pop the value for * "<<a->strPath<<endl;
423 // cout<<"pop the sign for * "<<b->strPath<<endl;
424
425 stateV ++ ;
426 int temp = stateV;
427 stateV ++ ;
428 T[temp][a -> from] += " ` " ;
429 T[temp][stateV] += " ` " ;
430 T[a -> to][stateV] += " ` " ;
431 T[a -> to][a -> from] += " ` " ;
432
433 struct path * p = new path();
434 p -> from = temp;
435 p -> to = stateV;
436 p -> strPath = " ( " + a -> strPath + " )* " ;
437 stackA.push(p);
438 // cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
439
440 delete a;delete b;
441 return stateV;
442 }
443 int lianjie( int stateV, stack < struct path *> & stackV )
444 {
445 struct path * a = stackV.top();stackV.pop();
446 // cout<<"pop the second value for & "<<a->strPath<<endl;
447 struct path * b = stackV.top();stackV.pop();
448 // cout<<"pop the sign for & "<<b->strPath<<endl;
449 struct path * c = stackV.top();stackV.pop();
450 // cout<<"pop the first value for & "<<c->strPath<<endl;
451 T[c -> to][a -> from] += " ` " ;
452
453 struct path * p = new path();
454 p -> from = c -> from;
455 p -> to = a -> to;
456 p -> strPath = " ( " + c -> strPath + " )&( " + a -> strPath + " ) " ;
457 stackV.push(p);
458 // cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
459 delete a;delete b;delete c;
460
461 return stateV;
462 }
463 int huo( int stateV, stack < struct path *> & stackA )
464 {
465 struct path * a = stackA.top();stackA.pop();
466 struct path * b = stackA.top();stackA.pop();
467 struct path * c = stackA.top();stackA.pop();
468 // cout<<"pop the first value for | "<<c->strPath<<endl;
469 // cout<<"pop the second value for | "<<a->strPath<<endl;
470 // cout<<"pop the sign for | "<<b->strPath<<endl;
471
472 stateV ++ ;
473 int temp = stateV;
474 stateV ++ ;
475 T[temp][a -> from] += " ` " ;
476 T[temp][c -> from] += " ` " ;
477 T[a -> to][stateV] += " ` " ;
478 T[c -> to][stateV] += " ` " ;
479
480 struct path * p = new path();
481 p -> from = temp;
482 p -> to = stateV;
483 p -> strPath = " ( " + c -> strPath + " )|( " + a -> strPath + " ) " ;
484 stackA.push(p);
485 // cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
486 delete a;delete b;delete c;
487
488 return stateV;
489 }
490
491 int reduceNFA( int stateCountV, int * typeIndexV, int typeNumV ) // stateCountV为NFA的起始状态
492 {
493 int i,j,k;
494 struct setNode * head, * setPtr, * end; // 用于构造链表的指针,新状态用链表存储
495 struct setNode * currentPtr; // 用于遍历链表中元素的指针
496
497 set < int > ::iterator ite_int; // 每个新的状态为一个set,封装到setNode中
498 int countNode = 1 ;
499 setPtr = new setNode();
500 setPtr -> num = 1 ;
501 setPtr -> Dset = new set < int > ();
502 setPtr -> next = NULL;
503 BFS(stateCountV, stateCountV);
504 for ( i = 0 ;arrForBFS[i] >= 0 ;i ++ ) // 先把起始状态的ε闭包求出作为新起始状态
505 setPtr -> Dset -> insert(arrForBFS[i]);
506
507 head = setPtr;
508 end = setPtr;
509 currentPtr = head;
510
511 string letter1[ 80 ]; // 这里定义了两个数组,一个存放读入的字符
512 set < int > letter2[ 80 ]; // 一个存放读入相应字符后所到达的状态集合
513 int up = 0 ;
514
515
516 while ( currentPtr != NULL ) // 遍历新状态链表
517 {
518 // cout<<"currentPtr遍历到"<<currentPtr->num<<endl;
519 // 下面一个双重循环,第一个循环表示遍历新状态集合的每一个状态
520 // 第二个循环用于从原NFA找从这个状态出发到达的所有状态
521 for ( ite_int = currentPtr -> Dset -> begin(); ite_int != currentPtr -> Dset -> end(); ++ ite_int )
522 for ( i = 0 ;i <= stateCountV;i ++ )
523 if ( T[ * ite_int][i] != " ` " && T[ * ite_int][i] != "" )
524 {
525 bool flag2 = 0 ;
526 for ( j = 0 ;j < up;j ++ )
527 if ( letter1[j] == T[ * ite_int][i] )
528 {
529 flag2 = 1 ;
530 break ;
531 }
532 if ( ! flag2 )
533 {
534 up ++ ;
535 letter1[up - 1 ] = T[ * ite_int][i]; // 读入的字符都记录下
536 }
537 BFS( i, stateCountV );
538 for ( k = 0 ;arrForBFS[k] >= 0 ;k ++ )
539 letter2[j].insert(arrForBFS[k]); // 所到的状态也记下
540 }
541
542 /* cout<<"从新的状态"<<currentPtr->num<<"生成的矩阵是"<<endl;
543 for( i = 0;i < up;i++ )
544 {
545 cout<<letter1[i]<<" ";
546 for( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ite_int )
547 cout<<*ite_int<<" ";
548 cout<<endl;
549 }//最后letter1,letter2记录了这个新状态读入不同的字符后会到达的状态合集 */
550
551
552 for ( i = 0 ;i < up;i ++ ) // 下面开始比较这些状态是不是以前有过的
553 {
554 struct setNode * p = head;
555 while ( p != NULL )
556 {
557 if ( letter2[i] == * (p -> Dset) )
558 break ;
559 p = p -> next;
560 }
561 if ( p != NULL ) // 有这个状态,只要加一条路径即可
562 recT[currentPtr -> num][p -> num] += letter1[i];
563 else // 没有这个状态,要在链表末加一个节点,并在recT中加路径
564 {
565 countNode ++ ;
566 recT[currentPtr -> num][countNode] += letter1[i];
567 setPtr = new setNode();
568 setPtr -> num = countNode;
569 setPtr -> Dset = new set < int > ();
570 for ( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ ite_int )
571 setPtr -> Dset -> insert( * ite_int );
572 setPtr -> next = NULL;
573 end -> next = setPtr;
574 end = setPtr;
575
576 for ( k = 0 ;k < typeNumV;k ++ )
577 {
578 ite_int = setPtr -> Dset -> find(typeIndexV[k]);
579 if ( ite_int != setPtr -> Dset -> end() || * (setPtr -> Dset -> end()) == stateCountV )
580 {
581 char temp[ 10 ];
582 string str = itoa(k,temp, 10 );
583 recT[setPtr -> num][ 0 ] = " - " + str; // 如果是终止状态,做记号
584 recT[ 0 ][setPtr -> num] = " - " + str;
585 }
586 }
587 }
588 }
589 for ( i = 0 ;i < up;i ++ ) // 清空letter1,letter2,用于下一轮遍历
590 {
591 letter1[i] = "" ;
592 letter2[i].clear();
593 }
594 up = 0 ;
595 currentPtr = currentPtr -> next;
596
597 /* cout<<"此时的链表为"<<endl;
598 struct setNode *p2 = head;
599 while( p2 != NULL )
600 {
601 cout<<p2->num<<" ";
602 p2 = p2->next;
603 }
604 cout<<endl; */
605 }
606
607 cout << " DFA生成完毕! " << endl;
608 /* cout<<"最后DFA为:"<<endl;
609 for( i = 0;i <= countNode;i++,cout<<endl )
610 for( j = 0; j <= countNode;j++ )
611 {
612 if( recT[i][j] == "" )cout<<" |";
613 else
614 cout<<setw(2)<<recT[i][j]<<"|";
615 } */
616 return countNode;
617
618 }
619
620 void BFS( int row, int stateCountV ) // 广度搜索法求一个状态的ε闭包
621 {
622 for ( int i = 0 ;i < 100 ;i ++ )
623 arrForBFS[i] = - 1 ;
624 int ite = - 1 ;
625 queue < int > que;
626 que.push(row);
627 ite ++ ;
628 arrForBFS[ite] = row;
629 while ( ! que.empty() )
630 {
631 int t = que.front();
632 que.pop();
633 for ( int i = 0 ;i <= stateCountV;i ++ )
634 if ( T[t][i] == " ` " )
635 {
636 ite ++ ;
637 arrForBFS[ite] = i;
638 que.push(i);
639 }
640 }
641 }
642
643 void reduceDFA() // 化简DFA算法
644 {
645 /*
646 算法思想:
647 构造二维数组reduceState[2][stateCountDFA]
648 矩阵第一行存储各状态当前所属集合标记
649 矩阵第二行存储个状态读入某个字符后的目标状态所属集合标记
650 每读入某个字符后,矩阵两行相对应位置都相等的状态为同一集合
651 第一行相等而第二行不相等的两状态分到不同集合,即将后一个状态的所属集合标记+1
652 全部字符读完后,扫描矩阵的第一行,即当前所属集合标记,相等的状态合并
653 简化完成
654 */
655 int pos = 1 ; // 非终结状态所属集合标记
656 int poe; // 终结状态所属集合标记
657 for ( int i = 1 ;i <= stateCountDFA;i ++ ) // 该循环用于给所有状态的所属集合赋初值,终结状态为stateCountDFA,非终结状态为1.
658 {
659 if (recT[ 0 ][i].find( ' - ' ) != string ::npos)
660 {
661 poe = stateCountDFA + atoi( & recT[ 0 ][i].at( 1 )); // 将不同正则表达式的终结状态标注为不同集合:stateCountDFA+正则表达式序号
662 reduceState[ 0 ][i] = poe;
663 reduceState[ 1 ][i] = poe;
664 }
665 else
666 {
667 reduceState[ 0 ][i] = pos;
668 reduceState[ 1 ][i] = pos;
669 }
670 }
671 poe = stateCountDFA + regCount + 1 ;
672 bool flag; // bool标记,读每个字符,调整集合状态后,重新读入该字符,直到不发生集合改变为止
673 for (Vt = tv.begin();Vt != tv.end();Vt ++ ) // 读入每个字符的循环
674 {
675 // cout<<*Vt;
676 flag = true ;
677 while (flag) // 该循环即为重新读入当前字符的循环
678 {
679 flag = false ;
680 bool item = true ; // 用于标记某状态读入当前字符是否产生状态变化
681 for ( int j = 1 ;j <= stateCountDFA;j ++ )
682 {
683 item = false ;
684 for ( int k = 1 ;k <= stateCountDFA;k ++ ) // 循环遍历DFA
685 {
686 if (recT[j][k].find( * Vt) != string ::npos) // 若当前状态读入该字符产生状态变化
687 {
688 item = true ;
689 reduceState[ 1 ][j] = reduceState[ 0 ][k]; // 目标集合赋值为目标状态所属集合
690 }
691 if ( ! item) // 不产生状态变化
692 reduceState[ 1 ][j] = 0 ; // 目标集合标记为0
693 }
694
695 }
696 bool b1 = false ; // 标记用于判断pos是否加1
697 bool b2 = false ; // 标记用于判断poe是否加1
698 for ( int p = 1 ;p <= stateCountDFA;p ++ ) // 循环扫描reduceState矩阵.第一个标记位置
699 {
700 b1 = false ;b2 = false ;
701 for ( int q = 2 ;q <= stateCountDFA;q ++ ) // 循环扫描reduceState矩阵.第二个标记位置
702 {
703 if (reduceState[ 0 ][p] == reduceState[ 0 ][q] && reduceState[ 1 ][p] != reduceState[ 1 ][q])
704 {
705 // 如果存在所属集合相等,目标集合不等的情况,将所属集合分开
706 if (reduceState[ 0 ][q] < stateCountDFA) // 当前状态为非终结状态
707 {
708 b1 = true ;
709 reduceState[ 0 ][q] = pos;
710 }
711 else // 当前状态为终结状态
712 {
713 b2 = true ;
714 reduceState[ 0 ][q] = poe;
715 }
716 flag = true ; // 发生集合调整,标记为true
717 }
718 }
719 if (b1)
720 pos ++ ;
721 if (b2)
722 poe ++ ;
723 }
724 }
725 }
726
727 // for(int i1=0;i1<stateCountDFA;i1++)
728 // cout<<reduceState[0][i1]<<" ";
729 // cout<<endl;
730 // for(i1=0;i1<stateCountDFA;i1++)
731 // cout<<reduceState[1][i1]<<" ";
732
733 for ( int j = 1 ;j < stateCountDFA;j ++ ) // 循环扫描reduceState矩阵第一行.第一个标记位置
734 {
735 if (reduceState[ 0 ][j] != 0 )
736 {
737 for ( int k = j + 1 ;k <= stateCountDFA;k ++ ) // 循环扫描reduceState矩阵第一行.第二个标记位置
738 {
739 if (reduceState[ 0 ][j] == reduceState[ 0 ][k]) // 两个标记位置值相等,即所属集合相同
740 combine(j,k); // 调用合并函数
741 }
742 }
743 }
744 /* cout<<"化简后的DFA为:"<<endl;
745 for(int m = 0;m <= stateCountDFA;m++,cout<<endl )
746 for(int n = 0; n <= stateCountDFA;n++ )
747 {
748 if( recT[m][n] == "" )cout<<" |";
749 else
750 cout<<recT[m][n]<<"|";
751 } */
752
753 cout << " DFA简化完毕! " << endl;
754 }
755
756 void combine( int j, int k)
757 {
758 /*
759 函数用于合并两个状态
760 把第K状态的相应的行和列的值加到第j状态上
761 并把k状态所有的值置空
762 */
763 for ( int i = 1 ;i <= stateCountDFA;i ++ )
764 {
765 recT[j][i] += recT[k][i];
766 recT[i][j] += recT[i][k];
767 }
768 for (i = 0 ;i <= stateCountDFA;i ++ )
769 {
770 recT[k][i] = "" ;
771 recT[i][k] = "" ;
772 }
773 reduceState[ 0 ][k] = 0 ;
774 }
775
776 void produceC( int nodeCountV, string * typeNameV ) // 产生.cpp文件
777 {
778 int i, j ,k;
779 char buf[ 1024 ]; // 临时保存读取出来的文件内容
780 string message = "" ;
781 string remainCode = "" ;
782 ifstream in ;
783 in .open( " Remain.txt " );
784 if ( in .is_open()) // 文件打开成功,说明曾经写入过东西
785 {
786 while ( in .good() && ! in .eof())
787 {
788 memset(buf, 0 , 1024 );
789 in .getline(buf, 1204 );
790 message = buf;
791 message += " \n " ;
792 remainCode += message;
793 }
794 in .close();
795 }
796
797 ofstream fout( " scanner.cpp " ,ios:: out );
798 fout << remainCode;
799 fout << " while(flag)\n " << " {\n " ;
800 fout << " switch(state)\n " << " {\n " ;
801 for ( i = 1 ;i <= nodeCountV;i ++ ) // 遍历每个新状态
802 {
803 fout << " case " << i << " :\n " ;
804 fout << " last++;\n " ;
805 fout << " c = code[last];\n " ;
806 if ( i == 1 )
807 {
808 fout << " if( c == '%' )\n " ; // 读到文件末尾,终结
809 fout << " {\n " ;
810 fout << " flag = 0;\n " ;
811 fout << " break;\n}\n " ;
812 }
813 bool flag3 = 0 ; bool flag2 = 0 ;
814 for ( j = 1 ;j <= nodeCountV;j ++ )
815 if ( recT[i][j] != " ' " && recT[i][j] != "" )
816 {
817 flag3 = 1 ;
818 if ( flag2 )
819 fout << " else " ;
820
821 fout << " if( contain( \ "" <<recT[i][j]<< " \ " ,c ) ) " ;
822 fout << " state = " << j << " ;\n " ;
823 flag2 = 1 ;
824 }
825 if ( flag3 )
826 fout << " else\n{\n " ;
827 // fout<<"{\n"<<"endState.find("<<i<<");\n";
828 // fout<<"if( ite_int == endState.find && *(endState.end()) != "<<i<<")\n";
829 if ( recT[i][ 0 ][ 0 ] == ' - ' )
830 {
831 string tempIndexStr = recT[i][ 0 ].substr( 1 ,recT[i][ 0 ].length() - 1 );
832 int tempIndex = atoi(tempIndexStr.c_str());
833 string type = typeNameV[tempIndex];
834 fout << " flag2 = true;\n " ;
835 fout << " for( k = 0;k < count;k++ )\n " ;;
836 fout << " {\n " << " if( SymbolTable[k][1] == code.substr(first,last-first) )\n " ;
837 fout << " {\nflag2 = false;\nbreak;\n}\n}\n " ;
838 fout << " if(flag2)\n{\n " ;
839 fout << " count++;\n " ;
840 fout << " SymbolTable[count-1][0] = \ "" <<type<< " \ " ;\n " ;
841 fout << " SymbolTable[count-1][1] = code.substr(first,last-first);\n " ;
842 fout << " }\n " ;
843
844 fout << " cout<<code.substr(first,last-first)<<endl;\n " ;
845 fout << " last--;\n " ;
846 fout << " first = last+1;\n " ;
847 fout << " state = 1;\n " ;
848 }
849 else
850 {
851 fout << " cout<<\ " 编译错误 ! \\n\ " ;\n " ;
852 fout << " flag = 0;\n " ;
853 }
854 if ( flag3 )
855 fout << " }\n " ;
856 fout << " break;\n " ;
857 }
858 fout << " default:\n " << " break;\n " << " }\n}\n " ;
859
860 fout << " cout<<\ " Symbol 表如下:\ " <<endl;\n " ;
861 fout << " cout<<setw(3)<<\ " 序号\ " <<setw(10)<<\ " token类型\ " <<\ " \ " <<\ " token名字\ " <<endl;\n " ;
862 fout << " for( k2 = 0;k2 < count;k2++ )\n " ;
863 fout << " {\n " << " cout<<setw(3)<<k2<<setw(10)<<SymbolTable[k2][0]<<\ " \ " <<SymbolTable[k2][1]<<endl;\n " ;
864 fout << " }\n " ;
865 fout << " system(\ " pause\ " );\n " ;
866 fout << " return 0;\n}\n " ;
867
868 cout << " 代码生成完毕!见scanner.cpp文件。 " << endl;
869 }
正则表达式在RegularExpressions.txt中写明,示例如下:
Identity
a(bc) *| bc *| d
Number
( 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 0 ) *
key
if | else | then
operator
> (` |= ) |< (` |= ) |= ( =| `)
第一行为正则表达式的个数,之后输入表达式代表的类型名,回车,表达式,回车,以此类推。
remain.txt存放着生成的代码文件中固定的代码,生成代码时会先从此文件读取。
里面的代码如下:
using namespace std;
#include < string >
#include < fstream >
#include < set >
#include < iomanip >
string SymbolTable[ 1000 ][ 2 ] = { "" };
bool contain( string str, char v )
{
for ( int i = 0 ;i < str.length();i ++ )
if ( str[i] == v )
return true ;
return false ;
}
int main()
{
string tempCode = "" ;
string code = "" ;
int i,j,k,k2 = 0 ;
char buf[ 1024 ]; // 临时保存读取出来的文件内容
string message = "" ;
ifstream infile;
infile.open( " code.txt " );
if (infile.is_open()) // 文件打开成功,说明曾经写入过东西
{
while (infile.good() && ! infile.eof())
{
memset(buf, 0 , 1024 );
infile.getline(buf, 1204 );
message = buf;
tempCode += message;
}
infile.close();
}
for ( i = 0 ;i < tempCode.length();i ++ )
{
if ( tempCode[i] != ' ' && tempCode[i] != ' \t ' && tempCode[i] != ' \n ' && tempCode[i] != ' ; ' )
code += tempCode[i];
}
code += " % " ;
cout << code << endl;
int first = 0 , last = - 1 ;
char c;
string tempStr = "" ;
int state = 1 ;
bool flag = 1 ;
bool flag2 = true ;
int count = 0 ;
// set<int> endState;
// set<int>::iterator ite_int;
// /保留代码 ///
表达式读入生成的DFA简化后生成.cpp文件,命名为scanner.cpp
根据上面的那个示例,生成的scanner.cpp文件内容为:
using namespace std;
#include < string >
#include < fstream >
#include < set >
#include < iomanip >
string SymbolTable[ 1000 ][ 2 ] = { "" };
bool contain( string str, char v )
{
for ( int i = 0 ;i < str.length();i ++ )
if ( str[i] == v )
return true ;
return false ;
}
int main()
{
string tempCode = "" ;
string code = "" ;
int i,j,k,k2 = 0 ;
char buf[ 1024 ]; // 临时保存读取出来的文件内容
string message = "" ;
ifstream infile;
infile.open( " code.txt " );
if (infile.is_open()) // 文件打开成功,说明曾经写入过东西
{
while (infile.good() && ! infile.eof())
{
memset(buf, 0 , 1024 );
infile.getline(buf, 1204 );
message = buf;
tempCode += message;
}
infile.close();
}
for ( i = 0 ;i < tempCode.length();i ++ )
{
if ( tempCode[i] != ' ' && tempCode[i] != ' \t ' && tempCode[i] != ' \n ' && tempCode[i] != ' ; ' )
code += tempCode[i];
}
code += " % " ;
cout << code << endl;
int first = 0 , last = - 1 ;
char c;
string tempStr = "" ;
int state = 1 ;
bool flag = 1 ;
bool flag2 = true ;
int count = 0 ;
// set<int> endState;
// set<int>::iterator ite_int;
// /保留代码 ///
while (flag)
{
switch (state)
{
case 1 :
last ++ ;
c = code[last];
if ( c == ' % ' )
{
flag = 0 ;
break ;
}
if ( contain( " a " ,c ) ) state = 2 ;
else if ( contain( " b " ,c ) ) state = 3 ;
else if ( contain( " d " ,c ) ) state = 4 ;
else if ( contain( " 1234567890 " ,c ) ) state = 5 ;
else if ( contain( " i " ,c ) ) state = 15 ;
else if ( contain( " e " ,c ) ) state = 16 ;
else if ( contain( " t " ,c ) ) state = 17 ;
else if ( contain( " ><= " ,c ) ) state = 18 ;
else
{
cout << " 编译错误!\n " ;
flag = 0 ;
}
break ;
case 2 :
last ++ ;
c = code[last];
if ( contain( " bb " ,c ) ) state = 21 ;
else
{
flag2 = true ;
for ( k = 0 ;k < count;k ++ )
{
if ( SymbolTable[k][ 1 ] == code.substr(first,last - first) )
{
flag2 = false ;
break ;
}
}
if (flag2)
{
count ++ ;
SymbolTable[count - 1 ][ 0 ] = " Identity " ;
SymbolTable[count - 1 ][ 1 ] = code.substr(first,last - first);
}
cout << code.substr(first,last - first) << endl;
last -- ;
first = last + 1 ;
state = 1 ;
}
break ;
case 3 :
last ++ ;
c = code[last];
if ( contain( " c " ,c ) ) state = 3 ;
else
{
flag2 = true ;
for ( k = 0 ;k < count;k ++ )
{
if ( SymbolTable[k][ 1 ] == code.substr(first,last - first) )
{
flag2 = false ;
break ;
}
}
if (flag2)
{
count ++ ;
SymbolTable[count - 1 ][ 0 ] = " Identity " ;
SymbolTable[count - 1 ][ 1 ] = code.substr(first,last - first);
}
cout << code.substr(first,last - first) << endl;
last -- ;
first = last + 1 ;
state = 1 ;
}
break ;
case 4 :
last ++ ;
c = code[last];
flag2 = true ;
for ( k = 0 ;k < count;k ++ )
{
if ( SymbolTable[k][ 1 ] == code.substr(first,last - first) )
{
flag2 = false ;
break ;
}
}
if (flag2)
{
count ++ ;
SymbolTable[count - 1 ][ 0 ] = " Identity " ;
SymbolTable[count - 1 ][ 1 ] = code.substr(first,last - first);
}
cout << code.substr(first,last - first) << endl;
last -- ;
first = last + 1 ;
state = 1 ;
break ;
case 5 :
last ++ ;
c = code[last];
if ( contain( " 1121233123444123455551234566666123456777777123456788888881234567899999999123456789000000000 " ,c ) ) state = 5 ;
else
{
flag2 = true ;
for ( k = 0 ;k < count;k ++ )
{
if ( SymbolTable[k][ 1 ] == code.substr(first,last - first) )
{
flag2 = false ;
break ;
}
}
if (flag2)
{
count ++ ;
SymbolTable[count - 1 ][ 0 ] = " Number " ;
SymbolTable[count - 1 ][ 1 ] = code.substr(first,last - first);
}
cout << code.substr(first,last - first) << endl;
last -- ;
first = last + 1 ;
state = 1 ;
}
break ;
case 6 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 7 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 8 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 9 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 10 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 11 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 12 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 13 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 14 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 15 :
last ++ ;
c = code[last];
if ( contain( " f " ,c ) ) state = 23 ;
else
{
cout << " 编译错误!\n " ;
flag = 0 ;
}
break ;
case 16 :
last ++ ;
c = code[last];
if ( contain( " l " ,c ) ) state = 24 ;
else
{
cout << " 编译错误!\n " ;
flag = 0 ;
}
break ;
case 17 :
last ++ ;
c = code[last];
if ( contain( " h " ,c ) ) state = 25 ;
else
{
cout << " 编译错误!\n " ;
flag = 0 ;
}
break ;
case 18 :
last ++ ;
c = code[last];
if ( contain( " === " ,c ) ) state = 26 ;
else
{
flag2 = true ;
for ( k = 0 ;k < count;k ++ )
{
if ( SymbolTable[k][ 1 ] == code.substr(first,last - first) )
{
flag2 = false ;
break ;
}
}
if (flag2)
{
count ++ ;
SymbolTable[count - 1 ][ 0 ] = " operator " ;
SymbolTable[count - 1 ][ 1 ] = code.substr(first,last - first);
}
cout << code.substr(first,last - first) << endl;
last -- ;
first = last + 1 ;
state = 1 ;
}
break ;
case 19 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 20 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 21 :
last ++ ;
c = code[last];
if ( contain( " c " ,c ) ) state = 2 ;
else
{
cout << " 编译错误!\n " ;
flag = 0 ;
}
break ;
case 22 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 23 :
last ++ ;
c = code[last];
flag2 = true ;
for ( k = 0 ;k < count;k ++ )
{
if ( SymbolTable[k][ 1 ] == code.substr(first,last - first) )
{
flag2 = false ;
break ;
}
}
if (flag2)
{
count ++ ;
SymbolTable[count - 1 ][ 0 ] = " key " ;
SymbolTable[count - 1 ][ 1 ] = code.substr(first,last - first);
}
cout << code.substr(first,last - first) << endl;
last -- ;
first = last + 1 ;
state = 1 ;
break ;
case 24 :
last ++ ;
c = code[last];
if ( contain( " s " ,c ) ) state = 30 ;
else
{
cout << " 编译错误!\n " ;
flag = 0 ;
}
break ;
case 25 :
last ++ ;
c = code[last];
if ( contain( " e " ,c ) ) state = 31 ;
else
{
cout << " 编译错误!\n " ;
flag = 0 ;
}
break ;
case 26 :
last ++ ;
c = code[last];
flag2 = true ;
for ( k = 0 ;k < count;k ++ )
{
if ( SymbolTable[k][ 1 ] == code.substr(first,last - first) )
{
flag2 = false ;
break ;
}
}
if (flag2)
{
count ++ ;
SymbolTable[count - 1 ][ 0 ] = " operator " ;
SymbolTable[count - 1 ][ 1 ] = code.substr(first,last - first);
}
cout << code.substr(first,last - first) << endl;
last -- ;
first = last + 1 ;
state = 1 ;
break ;
case 27 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 28 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 29 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 30 :
last ++ ;
c = code[last];
if ( contain( " e " ,c ) ) state = 23 ;
else
{
cout << " 编译错误!\n " ;
flag = 0 ;
}
break ;
case 31 :
last ++ ;
c = code[last];
if ( contain( " n " ,c ) ) state = 23 ;
else
{
cout << " 编译错误!\n " ;
flag = 0 ;
}
break ;
case 32 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
case 33 :
last ++ ;
c = code[last];
cout << " 编译错误!\n " ;
flag = 0 ;
break ;
default :
break ;
}
}
cout << " Symbol 表如下: " << endl;
cout << setw( 3 ) << " 序号 " << setw( 10 ) << " token类型 " << " " << " token名字 " << endl;
for ( k2 = 0 ;k2 < count;k2 ++ )
{
cout << setw( 3 ) << k2 << setw( 10 ) << SymbolTable[k2][ 0 ] << " " << SymbolTable[k2][ 1 ] << endl;
}
system( " pause " );
return 0 ;
}
等待编译的代码写在code.txt中,这个txt相当于代码输入界面,这里的内容对应着上面的那些正则表实例,内容如下:
bcc = 123 ;
else if d < 9
abc = 12 ;
运行主体代码后,scanner.cpp里的代码编译后的程序会读入code.txt中的代码,再在运行界面上输出token串和Symbol Table,大功告成!