为了巩固和确认当前成果，换一个语言去把词法分析、递归下降语法分析再做一遍，其中使用正则表达式去实现词法分析，实现一下解析四则运算表达式。考虑到相关需求，使用 ts 去进行实现，理由如下：

之前在 java 里写解析器是逐字符来的，这次想试试用正则，而 js 有正则字面量，写正则表达式比 java 舒服一些，特别是处理转义符的时候（虽然这里估计用不到很复杂的）
ts 支持代数数据类型，用来对 Token 和 AST 建模很方便
js 方便做AST的可视化……本来想做的，但发现直接展示的话没啥乐子，想做个步进又发现过于复杂了。先摆了，将来再说

同时也期待这玩意能作为一个原型，打算等后面把 jlox 实现完后整个给它搬到浏览器上，如果我真的学的完的话。

该四则运算语言中，支持+，-，*，/，**（乘方，没有这个就无聊了）和括号，优先级、结合性和 js 中的一致。为了让它更整蛊，字面量允许字符串和数字，其中-，/，%，** 等不允许左右两边出现字符串字面量（这个限制在计算的时候:P）。

词法分析

词法分析需要定义词素，定义下面的 Token 类型（虽然这里会忽略空白，但为了后面能正确维护 Scanner 的状态，还是需要知道空白的数量以及其中包含多少个换行符的，所以另外定义了一个 BLANK token，但后续会直接扔掉）：

type TokenValue = {
  type: 'NUMBER',
  literal: number
} | {
  type: 'STRING',
  literal: string
} | {
  type: 'PLUS' | 'MINUS' | 'SLASH' | 'STAR' | 'PERCENT' | 'LEFT_PAREN' | 'RIGHT_PAREN' | 'STAR_STAR' | 'BLANK' | 'EOF'
}
type Token = TokenValue & {
  lexeme: string, // 原始的词素
  line: number // 行数
}

然后，需要为单字符的 token，双字符的 token，字符串字面量，数字字面量，还有空白，为它们分别定义正则表达式，然后把它们用|拼接到一起，并且给予 g flag——这让正则表达式能够包含状态，设置正则从字符串的特定位置去开始进行匹配。

要注意最长匹配原则——如果**能匹配，就不要匹配成两个*，因此各 token 的顺序需要被正确处理——如果两个正则表达式有歧义，较长的要放前面。

// 该正则的各位置对应 token 如下：
// 1: STAR_STAR
// 2: PLUS, MINUS, STAR, SLASH, PERCENT, LEFT_PAREN, RIGHT_PAREN
// 3: STRING
// 4: NUMBER
// 5: BLANK
const REGEXP = (() => {
  const DOUBLE_TOKEN_REGEXP = /(\*\*)/
  const SINGLE_TOKEN_REGEXP = /(\+|\-|\*|\/|\%|\(|\))/
  const STRING_LITERAL_REGEXP = /"(.*?)(?<!\\)"/ // (?<!y)x 是负向否定预查，它会匹配这样的 x，它前面不是 y，这里是为了跳过、"
  const NUMBER_LITERAL_REGEXP = /(\d+(?:\.\d+)?)/ // 支持 1 和 1.0 这样的形式 ，?: 表示忽略该分组
  const IGNORE_TOKEN_REGEXP = /(\s+)/
  const ANOTHER_STRING_LITERAL_REGEXP = /"((?:[^"\\]|\\.)*)"/ // GPT 给的另一个字符串字面量正则
  return new RegExp([
    DOUBLE_TOKEN_REGEXP,
    SINGLE_TOKEN_REGEXP,
    STRING_LITERAL_REGEXP,
    NUMBER_LITERAL_REGEXP,
    IGNORE_TOKEN_REGEXP
  ].map(regexp => regexp.source).join('|'), 'g')
})()

function tryMatch(code: string, startIndex: number, line: number): Token {
  // lastIndex 是正则的开始匹配位置
  REGEXP.lastIndex = startIndex
  const match = REGEXP.exec(code)
  // 如果没有匹配到，或者并非从开始位置匹配到（这证明字符串开始位置有非法字符）
  if (!match || match.index != startIndex) {
    throw new Error(`词法错误！行数：${line}`)
  }
  
  if (match[1] != undefined) {
    return { type: 'STAR_STAR', line, lexeme: match[1] }
  }
  if (match[2] != undefined) {
    const map = { '+': 'PLUS', '-': 'MINUS', '*': 'STAR', '/': 'SLASH', '%': 'PERCENT', '(': 'LEFT_PAREN', ')': 'RIGHT_PAREN' } as const
    return { type: map[match[2] as keyof typeof map], lexeme: match[2], line }
  }
  if (match[3] != undefined) {
    return { type: 'STRING', lexeme: `"${match[3]}"`, line, literal: match[3]}
  }
  if (match[4] != undefined) {
    return { type: 'NUMBER', lexeme: match[4], line, literal: parseInt(match[4])}
  }
  if (match[5] != undefined) {
    return { type: 'BLANK', lexeme: match[5], line }
  }
  console.log(match)
  throw new Error("Impossible")
}

function readTokens(code: string): Token[] {
  // 使用正则的话，只需要维护下一个词素的开始位置就行，因此这里把 start 直接给去掉了
  let line = 1;
  let current = 0;
  const res: Token[] = []

  while (current < code.length) {
    const token = tryMatch(code, current, line)
    if (token.type !== 'BLANK'){
      res.push(token)
    }
    const lexeme = token.lexeme
    // 维护 current 和 line
    current += lexeme.length 
    for (const char of lexeme) {
      if (char === '\n') line++
    }
  }
  res.push({ type: 'EOF', line, lexeme: '' })
  return res
}

逻辑比 java 还清晰些……主要是不需要同时维护两个指针，不需要在字符的粒度上去进行编码，缺点是无法给予清晰的报错。

console.log(readTokens(`  -1 + 2.3 
2+2.3+"123"+3.4-1*2/3**4* *5%6
7(1+2) "11\\"23\\"44"`))
/*
[
  { type: 'MINUS', lexeme: '-', line: 1 },
  { type: 'NUMBER', lexeme: '1', line: 1, literal: 1 },
  { type: 'PLUS', lexeme: '+', line: 1 },
  { type: 'NUMBER', lexeme: '2.3', line: 1, literal: 2 },
  { type: 'NUMBER', lexeme: '2', line: 2, literal: 2 },
  { type: 'PLUS', lexeme: '+', line: 2 },
  { type: 'NUMBER', lexeme: '2.3', line: 2, literal: 2 },
  { type: 'PLUS', lexeme: '+', line: 2 },
  { type: 'STRING', lexeme: '"123"', line: 2, literal: '123' },
  { type: 'PLUS', lexeme: '+', line: 2 },
  { type: 'NUMBER', lexeme: '3.4', line: 2, literal: 3 },
  { type: 'MINUS', lexeme: '-', line: 2 },
  { type: 'NUMBER', lexeme: '1', line: 2, literal: 1 },
  { type: 'STAR', lexeme: '*', line: 2 },
  { type: 'NUMBER', lexeme: '2', line: 2, literal: 2 },
  { type: 'SLASH', lexeme: '/', line: 2 },
  { type: 'NUMBER', lexeme: '3', line: 2, literal: 3 },
  { type: 'STAR_STAR', line: 2, lexeme: '**' },
  { type: 'NUMBER', lexeme: '4', line: 2, literal: 4 },
  { type: 'STAR', lexeme: '*', line: 2 },
  { type: 'NUMBER', lexeme: '5', line: 2, literal: 5 },
  { type: 'PERCENT', lexeme: '%', line: 2 },
  { type: 'NUMBER', lexeme: '6', line: 2, literal: 6 },
  { type: 'NUMBER', lexeme: '7', line: 3, literal: 7 },
  { type: 'LEFT_PAREN', lexeme: '(', line: 3 },
  { type: 'NUMBER', lexeme: '1', line: 3, literal: 1 },
  { type: 'PLUS', lexeme: '+', line: 3 },
  { type: 'NUMBER', lexeme: '2', line: 3, literal: 2 },
  { type: 'RIGHT_PAREN', lexeme: ')', line: 3 },
  {
    type: 'STRING',
    lexeme: '"11\\"23\\"44"',
    line: 3,
    literal: '11\\"23\\"44'
  },
  { type: 'EOF', line: 3, lexeme: '' }
]
*/

语法分析

然后是语法分析，首先确认这些运算符的优先级和结合性，下表优先级从低到高。

运算符	结合性
`-` `+`	左
`*` `/` `%`	左
`**`	右
unary `-` `+`	右

根据优先级，定义相应 BNF，注意这里的乘方的规则 Exponent -> Unary ("**" Exponent)?——乘方是右结合的，因此右方要允许其它乘方的出现且为它的右子树，这也体现在 parser 中。

Expr     -> Term
Term     -> Factor (("-" | "+") Factor)*
Factor   -> Exponent (("*" | "/" | "%") Exponent)*
Exponent -> Unary ("**" Exponent)?  # 注意这里！
Unary    -> ("+" | "-") Unary | Primary
Primary  -> NUMBER | STRING | "(" Expr ")"

说到底递归下降法为啥是这样呢，该怎样建立它的心智模型呢……想不通啊想不通

然后实现解析器，这里实际上只有三种类型了——一元操作，二元操作和字面量，这里没必要去表示括号，但还是给它带上。

type Expr = {
  type: 'UNARY', op: Token, expr: Expr
} | {
  type: 'BINARY', left: Expr, op: Token,  right: Expr
} | {
  type: 'LITERAL', value: string | number
} | {
  type: 'GROUPING', expr: Expr
}

function parse(tokens: Token[]): Expr {
  // 同样的，需要一个 current 状态
  // 但和 Scanner 不同，因为递归下降中每个规则都要对应一个函数，没法很好地直接维护 current，因此需要定义一些原子的函数去读取 token 和获取 current
  let current = 0;
  const isAtEnd = () => current >= tokens.length
  const peek = () => tokens[current]
  const advance = () => isAtEnd() ? peek() : tokens[current++]
  const previous = () => tokens[current - 1]
  function match(...tokenTypes: Token['type'][]): boolean {
    if (isAtEnd()) return false
    const current = peek()
    for (const tokenType of tokenTypes) {
      if (current.type === tokenType) {
        advance()
        return true
      }
    }
    return false
  }
  function error(token: Token, msg: string): Error {
    throw new Error(`语法错误，#{msg}。行数：${token.line}`)
  }
  function consume(tokenType: Token['type'], msg: string) {
    if (match(tokenType)) {
      return
    }
    throw error(peek(), msg)
  }

  /*
Expr     -> Term
Term     -> Factor (("-" | "+") Factor)*
Factor   -> Exponent (("*" | "/" | "%") Exponent)*
Exponent -> Unary ("**" Exponent)?   # 右结合！
Unary    -> ("+" | "-") Unary | Primary
Primary  -> NUMBER | STRING | "(" Expr ")"
  */
  function expr(): Expr {
    return term()
  }
  function term(): Expr {
    let left = factor()
    while (match('MINUS', 'PLUS')) {
      const op = previous()
      const right = factor()
      left = {type: 'BINARY', op, left, right}
    }
    return left;
  }
  function factor(): Expr {
    let left = exponent()
    while (match('STAR', 'SLASH', 'PERCENT')) {
      const op = previous()
      const right = exponent()
      left = {type: 'BINARY', op, left, right}
    }
    return left;
  }
  function exponent(): Expr {
    let left = unary()
    if (match('STAR_STAR')) {
      const op = previous()
      const right = exponent()
      return {type: 'BINARY', left, op, right}
    }
    return left
  }
  function unary(): Expr {
    if (match('MINUS', 'PLUS')) {
      const op = previous()
      const expr = unary()
      return {type: 'UNARY', op, expr}
    }
    return primary()
  }
  function primary(): Expr {
    if (match('LEFT_PAREN')) {
      const res = expr()
      consume('RIGHT_PAREN', "Expect ')'")
      return {type: 'GROUPING', expr: res}
    }
    if (match('NUMBER', 'STRING')) {
      const res = previous()
      if (!(res.type === 'NUMBER' || res.type === 'STRING')) {
        throw new Error('Impossible')
      }
      return {type: 'LITERAL', value: res.literal}
    }
    throw error(peek(), 'Expect expression')
  }
  return expr()
}

打印和执行

最难的部分已经结束了，现在只欠打印和计算了，而它们本身也没啥难度（和意思），这里实现lisp形式和逆波兰表达式（不允许一元运算符）形式打印。

// lisp形式打印
function printAst(expr: Expr): string {
  if (expr.type === 'LITERAL') {
    if (typeof expr.value === 'number') {
      return expr.value.toString()
    }
    return `"${expr.value}"`
  }
  if (expr.type === 'BINARY') {
    const {op, left, right} = expr
    return `(${op.lexeme} ${printAst(left)} ${printAst(right)})`
  }
  if (expr.type === 'GROUPING') {
    return printAst(expr.expr)
  }
  const {op, expr: e} = expr
  return `(${op.lexeme} ${printAst(e)})`
}

// 容易发现逆波兰表达式是后序遍历
function reversePolishNotation(expr: Expr): string {
  if (expr.type === 'LITERAL') {
    if (typeof expr.value === 'number') {
      return expr.value.toString()
    }
    return `"${expr.value}"`
  }
  if (expr.type === 'UNARY') {
    throw new Error("逆波兰表达式不允许一元运算符")
  }
  
  if (expr.type === 'GROUPING') {
    return reversePolandExpr(expr.expr)
  }
  const {op, left, right} = expr
  const leftRes = reversePolandExpr(left)
  const rightRes = reversePolandExpr(right)
  return `${leftRes} ${rightRes} ${op.lexeme}`
}




// 计算，如果出现错误则返回Error
function evaluate(expr: Expr): string | number | Error {
  function error(token: Token, msg: string): Error {
    return new Error(`计算错误，${msg}。行数：${token.line}`)
  }
  function go(expr: Expr): string | number {
    if (expr.type === 'LITERAL') {
      return expr.value
    }
    if (expr.type === 'GROUPING') {
      return go(expr.expr);
    }
    if (expr.type === 'UNARY') {
      const {op , expr: subExpr} = expr
      const res = go(subExpr)
      if (op.type === 'MINUS') {
        if (typeof res === 'string') {
          throw error(op, "Unary '-' cannot apply to string")
        }
        return -res;
      }
      if (op.type === 'PLUS') {
        // 对字符串会转成数字
        if (typeof res === 'string') {
          const numRes = parseInt(res)
          if (isNaN(numRes)) {
            throw error(op, `string "${res}" cannot cast to number`)
          }
          return numRes
        }
        return res
      }
      throw new Error("Impossible")
    }
    const {op, left, right} = expr
    const l = go(left)
    const r = go(right)
    if (op.type === 'PLUS') {
      // 如果任一操作数是字符串，使用字符串操作
      if (typeof l === 'string' || typeof r === 'string') {
        return l.toString() + r.toString()
      }
      return l + r
    }
    if (op.type === 'MINUS') {
      if (typeof l === 'string' || typeof r === 'string') {
        throw error(op, "Binary '-' cannot apply to string")
      }
      return l - r
    }
    if (op.type === 'STAR') {
      if (typeof l === 'string' && typeof r === 'string') {
        throw error(op, "Binary '*' cannot apply to string and string")
      }
      if (typeof l === 'number' && typeof r === 'number') {
        return l * r
      }
      if (typeof l === 'number') {
        return r.toString().repeat(l)
      } 
      if (typeof r === 'number') {
        return l.toString().repeat(r)
      } 
      throw new Error('impossible')
    }
    if (op.type === 'PERCENT') {
      if (typeof l === 'string' || typeof r === 'string') {
        throw error(op, "Binary '%' cannot apply to string")
      }
      return l % r
    }
    
    if (op.type === 'SLASH') {
      if (typeof l === 'string' || typeof r === 'string') {
        throw error(op, "Binary '/' cannot apply to string")
      }
      return l / r
    }
    if (op.type === 'STAR_STAR') {
      if (typeof l === 'string' || typeof r === 'string') {
        throw error(op, "Binary '**' cannot apply to string")
      }
      return l ** r
    }
    throw new Error('impossible')
  }
  try {
    return go(expr)
  } catch (e) {
    return e as Error
  }
}

const code = '+("111" + "222") * 2 ** 3'
const tokens = readTokens(code)
const expr = parse(tokens)
console.log(printAst(expr)) // (* (+ (+ "111" "222")) (** 2 3))
console.log(evaluate(expr)) // 889776

编译原理

本博客所有文章除特别声明外，均采用 CC BY-NC-SA 4.0 协议，转载请注明出处！

《Crafting Interpreters》学习笔记 2——状态，控制流，函数，语义分析上一篇

《Crafting Interpreters》学习笔记 1——词法分析，递归下降法下一篇

使用 TS 使用递归下降法实现解析四则运算

词法分析

语法分析

打印和执行