为了巩固和确认当前成果,换一个语言去把词法分析、递归下降语法分析再做一遍,其中使用正则表达式去实现词法分析,实现一下解析四则运算表达式。考虑到相关需求,使用 ts 去进行实现,理由如下:
之前在 java 里写解析器是逐字符来的,这次想试试用正则,而 js 有正则字面量,写正则表达式比 java 舒服一些,特别是处理转义符的时候(虽然这里估计用不到很复杂的) 
ts 支持代数数据类型,用来对 Token 和 AST 建模很方便 
js 方便做AST的可视化……本来想做的,但发现直接展示的话没啥乐子,想做个步进又发现过于复杂了。先摆了,将来再说 
 
同时也期待这玩意能作为一个原型,打算等后面把 jlox 实现完后整个给它搬到浏览器上,如果我真的学的完的话。
该四则运算语言中,支持+,-,*,/,**(乘方,没有这个就无聊了)和括号,优先级、结合性和 js 中的一致。为了让它更整蛊,字面量允许字符串和数字,其中-,/,%,** 等不允许左右两边出现字符串字面量(这个限制在计算的时候:P)。
词法分析 词法分析需要定义词素,定义下面的 Token 类型(虽然这里会忽略空白,但为了后面能正确维护 Scanner 的状态,还是需要知道空白的数量以及其中包含多少个换行符的,所以另外定义了一个 BLANK token,但后续会直接扔掉):
type  TokenValue  = {   type : 'NUMBER' ,   literal : number  } | {   type : 'STRING' ,   literal : string  } | {   type : 'PLUS'  | 'MINUS'  | 'SLASH'  | 'STAR'  | 'PERCENT'  | 'LEFT_PAREN'  | 'RIGHT_PAREN'  | 'STAR_STAR'  | 'BLANK'  | 'EOF'  }type  Token  = TokenValue  & {   lexeme : string ,    line : number   }
 
然后,需要为单字符的 token,双字符的 token,字符串字面量,数字字面量,还有空白,为它们分别定义正则表达式,然后把它们用|拼接到一起,并且给予 g flag——这让正则表达式能够包含状态,设置正则从字符串的特定位置去开始进行匹配。
要注意最长匹配原则——如果**能匹配,就不要匹配成两个*,因此各 token 的顺序需要被正确处理——如果两个正则表达式有歧义,较长的要放前面。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 const  REGEXP  = (() =>  {   const  DOUBLE_TOKEN_REGEXP  = /(\*\*)/    const  SINGLE_TOKEN_REGEXP  = /(\+|\-|\*|\/|\%|\(|\))/    const  STRING_LITERAL_REGEXP  = /"(.*?)(?<!\\)"/     const  NUMBER_LITERAL_REGEXP  = /(\d+(?:\.\d+)?)/     const  IGNORE_TOKEN_REGEXP  = /(\s+)/    const  ANOTHER_STRING_LITERAL_REGEXP  = /"((?:[^"\\]|\\.)*)"/     return  new  RegExp ([     DOUBLE_TOKEN_REGEXP ,     SINGLE_TOKEN_REGEXP ,     STRING_LITERAL_REGEXP ,     NUMBER_LITERAL_REGEXP ,     IGNORE_TOKEN_REGEXP    ].map (regexp  =>  regexp.source ).join ('|' ), 'g' ) })()function  tryMatch (code: string , startIndex: number , line: number  ): Token  {      REGEXP .lastIndex  = startIndex   const  match = REGEXP .exec (code)      if  (!match || match.index  != startIndex) {     throw  new  Error (`词法错误!行数:${line} ` )   }      if  (match[1 ] != undefined ) {     return  { type : 'STAR_STAR' , line, lexeme : match[1 ] }   }   if  (match[2 ] != undefined ) {     const  map = { '+' : 'PLUS' , '-' : 'MINUS' , '*' : 'STAR' , '/' : 'SLASH' , '%' : 'PERCENT' , '(' : 'LEFT_PAREN' , ')' : 'RIGHT_PAREN'  } as  const      return  { type : map[match[2 ] as  keyof typeof  map], lexeme : match[2 ], line }   }   if  (match[3 ] != undefined ) {     return  { type : 'STRING' , lexeme : `"${match[3 ]} "` , line, literal : match[3 ]}   }   if  (match[4 ] != undefined ) {     return  { type : 'NUMBER' , lexeme : match[4 ], line, literal : parseInt (match[4 ])}   }   if  (match[5 ] != undefined ) {     return  { type : 'BLANK' , lexeme : match[5 ], line }   }   console .log (match)   throw  new  Error ("Impossible" ) }function  readTokens (code: string  ): Token [] {      let  line = 1 ;   let  current = 0 ;   const  res : Token [] = []   while  (current < code.length ) {     const  token = tryMatch (code, current, line)     if  (token.type  !== 'BLANK' ){       res.push (token)     }     const  lexeme = token.lexeme           current += lexeme.length       for  (const  char of  lexeme) {       if  (char === '\n' ) line++     }   }   res.push ({ type : 'EOF' , line, lexeme : ''  })   return  res }
 
逻辑比 java 还清晰些……主要是不需要同时维护两个指针,不需要在字符的粒度上去进行编码,缺点是无法给予清晰的报错。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 console .log (readTokens (`  -1 + 2.3  2+2.3+"123"+3.4-1*2/3**4* *5%6 7(1+2) "11\\"23\\"44"` ))
 
语法分析 然后是语法分析,首先确认这些运算符的优先级和结合性,下表优先级从低到高。
运算符 
结合性 
 
 
- + 
左 
 
* / % 
左 
 
** 
右 
 
unary - + 
右 
 
根据优先级,定义相应 BNF,注意这里的乘方的规则 Exponent -> Unary ("**" Exponent)?——乘方是右结合的,因此右方要允许其它乘方的出现且为它的右子树,这也体现在 parser 中。
Expr     -> Term Term     -> Factor (("-" | "+") Factor)* Factor   -> Exponent (("*" | "/" | "%") Exponent)* Exponent -> Unary ("**" Exponent)?  # 注意这里! Unary    -> ("+" | "-") Unary | Primary Primary  -> NUMBER | STRING | "(" Expr ")"
 
说到底递归下降法为啥是这样呢,该怎样建立它的心智模型呢……想不通啊想不通
然后实现解析器,这里实际上只有三种类型了——一元操作,二元操作和字面量,这里没必要去表示括号,但还是给它带上。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 type  Expr  = {   type : 'UNARY' , op : Token , expr : Expr  } | {   type : 'BINARY' , left : Expr , op : Token ,  right : Expr  } | {   type : 'LITERAL' , value : string  | number  } | {   type : 'GROUPING' , expr : Expr  }function  parse (tokens: Token[] ): Expr  {         let  current = 0 ;   const  isAtEnd  = ( ) => current >= tokens.length    const  peek  = ( ) => tokens[current]   const  advance  = ( ) => isAtEnd () ? peek () : tokens[current++]   const  previous  = ( ) => tokens[current - 1 ]   function  match (...tokenTypes: Token['type' ][] ): boolean  {     if  (isAtEnd ()) return  false      const  current = peek ()     for  (const  tokenType of  tokenTypes) {       if  (current.type  === tokenType) {         advance ()         return  true        }     }     return  false    }   function  error (token: Token, msg: string  ): Error  {     throw  new  Error (`语法错误,#{msg}。行数:${token.line} ` )   }   function  consume (tokenType: Token['type' ], msg: string  ) {     if  (match (tokenType)) {       return      }     throw  error (peek (), msg)   }      function  expr ( ): Expr  {     return  term ()   }   function  term ( ): Expr  {     let  left = factor ()     while  (match ('MINUS' , 'PLUS' )) {       const  op = previous ()       const  right = factor ()       left = {type : 'BINARY' , op, left, right}     }     return  left;   }   function  factor ( ): Expr  {     let  left = exponent ()     while  (match ('STAR' , 'SLASH' , 'PERCENT' )) {       const  op = previous ()       const  right = exponent ()       left = {type : 'BINARY' , op, left, right}     }     return  left;   }   function  exponent ( ): Expr  {     let  left = unary ()     if  (match ('STAR_STAR' )) {       const  op = previous ()       const  right = exponent ()       return  {type : 'BINARY' , left, op, right}     }     return  left   }   function  unary ( ): Expr  {     if  (match ('MINUS' , 'PLUS' )) {       const  op = previous ()       const  expr = unary ()       return  {type : 'UNARY' , op, expr}     }     return  primary ()   }   function  primary ( ): Expr  {     if  (match ('LEFT_PAREN' )) {       const  res = expr ()       consume ('RIGHT_PAREN' , "Expect ')'" )       return  {type : 'GROUPING' , expr : res}     }     if  (match ('NUMBER' , 'STRING' )) {       const  res = previous ()       if  (!(res.type  === 'NUMBER'  || res.type  === 'STRING' )) {         throw  new  Error ('Impossible' )       }       return  {type : 'LITERAL' , value : res.literal }     }     throw  error (peek (), 'Expect expression' )   }   return  expr () }
 
打印和执行 最难的部分已经结束了,现在只欠打印和计算了,而它们本身也没啥难度(和意思),这里实现lisp形式和逆波兰表达式(不允许一元运算符)形式打印。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 function  printAst (expr: Expr ): string  {   if  (expr.type  === 'LITERAL' ) {     if  (typeof  expr.value  === 'number' ) {       return  expr.value .toString ()     }     return  `"${expr.value} "`    }   if  (expr.type  === 'BINARY' ) {     const  {op, left, right} = expr     return  `(${op.lexeme}  ${printAst(left)}  ${printAst(right)} )`    }   if  (expr.type  === 'GROUPING' ) {     return  printAst (expr.expr )   }   const  {op, expr : e} = expr   return  `(${op.lexeme}  ${printAst(e)} )`  }function  reversePolishNotation (expr: Expr ): string  {   if  (expr.type  === 'LITERAL' ) {     if  (typeof  expr.value  === 'number' ) {       return  expr.value .toString ()     }     return  `"${expr.value} "`    }   if  (expr.type  === 'UNARY' ) {     throw  new  Error ("逆波兰表达式不允许一元运算符" )   }      if  (expr.type  === 'GROUPING' ) {     return  reversePolandExpr (expr.expr )   }   const  {op, left, right} = expr   const  leftRes = reversePolandExpr (left)   const  rightRes = reversePolandExpr (right)   return  `${leftRes}  ${rightRes}  ${op.lexeme} `  }function  evaluate (expr: Expr ): string  | number  | Error  {   function  error (token: Token, msg: string  ): Error  {     return  new  Error (`计算错误,${msg} 。行数:${token.line} ` )   }   function  go (expr: Expr ): string  | number  {     if  (expr.type  === 'LITERAL' ) {       return  expr.value      }     if  (expr.type  === 'GROUPING' ) {       return  go (expr.expr );     }     if  (expr.type  === 'UNARY' ) {       const  {op , expr : subExpr} = expr       const  res = go (subExpr)       if  (op.type  === 'MINUS' ) {         if  (typeof  res === 'string' ) {           throw  error (op, "Unary '-' cannot apply to string" )         }         return  -res;       }       if  (op.type  === 'PLUS' ) {                  if  (typeof  res === 'string' ) {           const  numRes = parseInt (res)           if  (isNaN (numRes)) {             throw  error (op, `string "${res} " cannot cast to number` )           }           return  numRes         }         return  res       }       throw  new  Error ("Impossible" )     }     const  {op, left, right} = expr     const  l = go (left)     const  r = go (right)     if  (op.type  === 'PLUS' ) {              if  (typeof  l === 'string'  || typeof  r === 'string' ) {         return  l.toString () + r.toString ()       }       return  l + r     }     if  (op.type  === 'MINUS' ) {       if  (typeof  l === 'string'  || typeof  r === 'string' ) {         throw  error (op, "Binary '-' cannot apply to string" )       }       return  l - r     }     if  (op.type  === 'STAR' ) {       if  (typeof  l === 'string'  && typeof  r === 'string' ) {         throw  error (op, "Binary '*' cannot apply to string and string" )       }       if  (typeof  l === 'number'  && typeof  r === 'number' ) {         return  l * r       }       if  (typeof  l === 'number' ) {         return  r.toString ().repeat (l)       }        if  (typeof  r === 'number' ) {         return  l.toString ().repeat (r)       }        throw  new  Error ('impossible' )     }     if  (op.type  === 'PERCENT' ) {       if  (typeof  l === 'string'  || typeof  r === 'string' ) {         throw  error (op, "Binary '%' cannot apply to string" )       }       return  l % r     }          if  (op.type  === 'SLASH' ) {       if  (typeof  l === 'string'  || typeof  r === 'string' ) {         throw  error (op, "Binary '/' cannot apply to string" )       }       return  l / r     }     if  (op.type  === 'STAR_STAR' ) {       if  (typeof  l === 'string'  || typeof  r === 'string' ) {         throw  error (op, "Binary '**' cannot apply to string" )       }       return  l ** r     }     throw  new  Error ('impossible' )   }   try  {     return  go (expr)   } catch  (e) {     return  e as  Error    } }const  code = '+("111" + "222") * 2 ** 3' const  tokens = readTokens (code)const  expr = parse (tokens)console .log (printAst (expr)) console .log (evaluate(expr))