为了巩固和确认当前成果,换一个语言去把词法分析、递归下降语法分析再做一遍,其中使用正则表达式去实现词法分析,实现一下解析四则运算表达式。考虑到相关需求,使用 ts 去进行实现,理由如下:
之前在 java 里写解析器是逐字符来的,这次想试试用正则,而 js 有正则字面量,写正则表达式比 java 舒服一些,特别是处理转义符的时候(虽然这里估计用不到很复杂的)
ts 支持代数数据类型,用来对 Token 和 AST 建模很方便
js 方便做AST的可视化……本来想做的,但发现直接展示的话没啥乐子,想做个步进又发现过于复杂了。先摆了,将来再说
同时也期待这玩意能作为一个原型,打算等后面把 jlox 实现完后整个给它搬到浏览器上,如果我真的学的完的话。
该四则运算语言中,支持+
,-
,*
,/
,**
(乘方,没有这个就无聊了)和括号,优先级、结合性和 js 中的一致。为了让它更整蛊,字面量允许字符串和数字,其中-
,/
,%
,**
等不允许左右两边出现字符串字面量(这个限制在计算的时候:P)。
词法分析 词法分析需要定义词素,定义下面的 Token 类型(虽然这里会忽略空白,但为了后面能正确维护 Scanner 的状态,还是需要知道空白的数量以及其中包含多少个换行符的,所以另外定义了一个 BLANK token,但后续会直接扔掉):
type TokenValue = { type : 'NUMBER' , literal : number } | { type : 'STRING' , literal : string } | { type : 'PLUS' | 'MINUS' | 'SLASH' | 'STAR' | 'PERCENT' | 'LEFT_PAREN' | 'RIGHT_PAREN' | 'STAR_STAR' | 'BLANK' | 'EOF' }type Token = TokenValue & { lexeme : string , line : number }
然后,需要为单字符的 token,双字符的 token,字符串字面量,数字字面量,还有空白,为它们分别定义正则表达式,然后把它们用|
拼接到一起,并且给予 g flag——这让正则表达式能够包含状态,设置正则从字符串的特定位置去开始进行匹配。
要注意最长匹配原则——如果**
能匹配,就不要匹配成两个*
,因此各 token 的顺序需要被正确处理——如果两个正则表达式有歧义,较长的要放前面。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 const REGEXP = (() => { const DOUBLE_TOKEN_REGEXP = /(\*\*)/ const SINGLE_TOKEN_REGEXP = /(\+|\-|\*|\/|\%|\(|\))/ const STRING_LITERAL_REGEXP = /"(.*?)(?<!\\)"/ const NUMBER_LITERAL_REGEXP = /(\d+(?:\.\d+)?)/ const IGNORE_TOKEN_REGEXP = /(\s+)/ const ANOTHER_STRING_LITERAL_REGEXP = /"((?:[^"\\]|\\.)*)"/ return new RegExp ([ DOUBLE_TOKEN_REGEXP , SINGLE_TOKEN_REGEXP , STRING_LITERAL_REGEXP , NUMBER_LITERAL_REGEXP , IGNORE_TOKEN_REGEXP ].map (regexp => regexp.source ).join ('|' ), 'g' ) })()function tryMatch (code: string , startIndex: number , line: number ): Token { REGEXP .lastIndex = startIndex const match = REGEXP .exec (code) if (!match || match.index != startIndex) { throw new Error (`词法错误!行数:${line} ` ) } if (match[1 ] != undefined ) { return { type : 'STAR_STAR' , line, lexeme : match[1 ] } } if (match[2 ] != undefined ) { const map = { '+' : 'PLUS' , '-' : 'MINUS' , '*' : 'STAR' , '/' : 'SLASH' , '%' : 'PERCENT' , '(' : 'LEFT_PAREN' , ')' : 'RIGHT_PAREN' } as const return { type : map[match[2 ] as keyof typeof map], lexeme : match[2 ], line } } if (match[3 ] != undefined ) { return { type : 'STRING' , lexeme : `"${match[3 ]} "` , line, literal : match[3 ]} } if (match[4 ] != undefined ) { return { type : 'NUMBER' , lexeme : match[4 ], line, literal : parseInt (match[4 ])} } if (match[5 ] != undefined ) { return { type : 'BLANK' , lexeme : match[5 ], line } } console .log (match) throw new Error ("Impossible" ) }function readTokens (code: string ): Token [] { let line = 1 ; let current = 0 ; const res : Token [] = [] while (current < code.length ) { const token = tryMatch (code, current, line) if (token.type !== 'BLANK' ){ res.push (token) } const lexeme = token.lexeme current += lexeme.length for (const char of lexeme) { if (char === '\n' ) line++ } } res.push ({ type : 'EOF' , line, lexeme : '' }) return res }
逻辑比 java 还清晰些……主要是不需要同时维护两个指针,不需要在字符的粒度上去进行编码,缺点是无法给予清晰的报错。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 console .log (readTokens (` -1 + 2.3 2+2.3+"123"+3.4-1*2/3**4* *5%6 7(1+2) "11\\"23\\"44"` ))
语法分析 然后是语法分析,首先确认这些运算符的优先级和结合性,下表优先级从低到高。
运算符
结合性
-
+
左
*
/
%
左
**
右
unary -
+
右
根据优先级,定义相应 BNF,注意这里的乘方的规则 Exponent -> Unary ("**" Exponent)?
——乘方是右结合的,因此右方要允许其它乘方的出现且为它的右子树,这也体现在 parser 中。
Expr -> Term Term -> Factor (("-" | "+") Factor)* Factor -> Exponent (("*" | "/" | "%") Exponent)* Exponent -> Unary ("**" Exponent)? # 注意这里! Unary -> ("+" | "-") Unary | Primary Primary -> NUMBER | STRING | "(" Expr ")"
说到底递归下降法为啥是这样呢,该怎样建立它的心智模型呢……想不通啊想不通
然后实现解析器,这里实际上只有三种类型了——一元操作,二元操作和字面量,这里没必要去表示括号,但还是给它带上。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 type Expr = { type : 'UNARY' , op : Token , expr : Expr } | { type : 'BINARY' , left : Expr , op : Token , right : Expr } | { type : 'LITERAL' , value : string | number } | { type : 'GROUPING' , expr : Expr }function parse (tokens: Token[] ): Expr { let current = 0 ; const isAtEnd = ( ) => current >= tokens.length const peek = ( ) => tokens[current] const advance = ( ) => isAtEnd () ? peek () : tokens[current++] const previous = ( ) => tokens[current - 1 ] function match (...tokenTypes: Token['type' ][] ): boolean { if (isAtEnd ()) return false const current = peek () for (const tokenType of tokenTypes) { if (current.type === tokenType) { advance () return true } } return false } function error (token: Token, msg: string ): Error { throw new Error (`语法错误,#{msg}。行数:${token.line} ` ) } function consume (tokenType: Token['type' ], msg: string ) { if (match (tokenType)) { return } throw error (peek (), msg) } function expr ( ): Expr { return term () } function term ( ): Expr { let left = factor () while (match ('MINUS' , 'PLUS' )) { const op = previous () const right = factor () left = {type : 'BINARY' , op, left, right} } return left; } function factor ( ): Expr { let left = exponent () while (match ('STAR' , 'SLASH' , 'PERCENT' )) { const op = previous () const right = exponent () left = {type : 'BINARY' , op, left, right} } return left; } function exponent ( ): Expr { let left = unary () if (match ('STAR_STAR' )) { const op = previous () const right = exponent () return {type : 'BINARY' , left, op, right} } return left } function unary ( ): Expr { if (match ('MINUS' , 'PLUS' )) { const op = previous () const expr = unary () return {type : 'UNARY' , op, expr} } return primary () } function primary ( ): Expr { if (match ('LEFT_PAREN' )) { const res = expr () consume ('RIGHT_PAREN' , "Expect ')'" ) return {type : 'GROUPING' , expr : res} } if (match ('NUMBER' , 'STRING' )) { const res = previous () if (!(res.type === 'NUMBER' || res.type === 'STRING' )) { throw new Error ('Impossible' ) } return {type : 'LITERAL' , value : res.literal } } throw error (peek (), 'Expect expression' ) } return expr () }
打印和执行 最难的部分已经结束了,现在只欠打印和计算了,而它们本身也没啥难度(和意思),这里实现lisp形式和逆波兰表达式(不允许一元运算符)形式打印。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 function printAst (expr: Expr ): string { if (expr.type === 'LITERAL' ) { if (typeof expr.value === 'number' ) { return expr.value .toString () } return `"${expr.value} "` } if (expr.type === 'BINARY' ) { const {op, left, right} = expr return `(${op.lexeme} ${printAst(left)} ${printAst(right)} )` } if (expr.type === 'GROUPING' ) { return printAst (expr.expr ) } const {op, expr : e} = expr return `(${op.lexeme} ${printAst(e)} )` }function reversePolishNotation (expr: Expr ): string { if (expr.type === 'LITERAL' ) { if (typeof expr.value === 'number' ) { return expr.value .toString () } return `"${expr.value} "` } if (expr.type === 'UNARY' ) { throw new Error ("逆波兰表达式不允许一元运算符" ) } if (expr.type === 'GROUPING' ) { return reversePolandExpr (expr.expr ) } const {op, left, right} = expr const leftRes = reversePolandExpr (left) const rightRes = reversePolandExpr (right) return `${leftRes} ${rightRes} ${op.lexeme} ` }function evaluate (expr: Expr ): string | number | Error { function error (token: Token, msg: string ): Error { return new Error (`计算错误,${msg} 。行数:${token.line} ` ) } function go (expr: Expr ): string | number { if (expr.type === 'LITERAL' ) { return expr.value } if (expr.type === 'GROUPING' ) { return go (expr.expr ); } if (expr.type === 'UNARY' ) { const {op , expr : subExpr} = expr const res = go (subExpr) if (op.type === 'MINUS' ) { if (typeof res === 'string' ) { throw error (op, "Unary '-' cannot apply to string" ) } return -res; } if (op.type === 'PLUS' ) { if (typeof res === 'string' ) { const numRes = parseInt (res) if (isNaN (numRes)) { throw error (op, `string "${res} " cannot cast to number` ) } return numRes } return res } throw new Error ("Impossible" ) } const {op, left, right} = expr const l = go (left) const r = go (right) if (op.type === 'PLUS' ) { if (typeof l === 'string' || typeof r === 'string' ) { return l.toString () + r.toString () } return l + r } if (op.type === 'MINUS' ) { if (typeof l === 'string' || typeof r === 'string' ) { throw error (op, "Binary '-' cannot apply to string" ) } return l - r } if (op.type === 'STAR' ) { if (typeof l === 'string' && typeof r === 'string' ) { throw error (op, "Binary '*' cannot apply to string and string" ) } if (typeof l === 'number' && typeof r === 'number' ) { return l * r } if (typeof l === 'number' ) { return r.toString ().repeat (l) } if (typeof r === 'number' ) { return l.toString ().repeat (r) } throw new Error ('impossible' ) } if (op.type === 'PERCENT' ) { if (typeof l === 'string' || typeof r === 'string' ) { throw error (op, "Binary '%' cannot apply to string" ) } return l % r } if (op.type === 'SLASH' ) { if (typeof l === 'string' || typeof r === 'string' ) { throw error (op, "Binary '/' cannot apply to string" ) } return l / r } if (op.type === 'STAR_STAR' ) { if (typeof l === 'string' || typeof r === 'string' ) { throw error (op, "Binary '**' cannot apply to string" ) } return l ** r } throw new Error ('impossible' ) } try { return go (expr) } catch (e) { return e as Error } }const code = '+("111" + "222") * 2 ** 3' const tokens = readTokens (code)const expr = parse (tokens)console .log (printAst (expr)) console .log (evaluate(expr))