kmizuの日記

プログラミングや形式言語に関係のあることを書いたり書かなかったり。

PEGのパーサ in Scala

ScalaのパーザコンビネータでParsing Expression Grammar(PEG)の文法を定義してみました。PEGのインタプリタ作る副産物としてできたもので、構文木も作ってくれます。まあ、使いたい人が居るかは激しく疑問ですが、パーサコンビネータの使い方の参考になるかもしれません。ちなみに、

  • 各ルールはセミコロンで終端する
  • ルールの定義に、<-だけでなく、=も使える

点は、Fordの論文に載っているPEGの定義と異なっている部分です。

object Ast {
  trait HasPosition { def pos: Pos }
  case class Pos(line: Int, column: Int)
  case class Grammar(pos: Pos, start: Symbol, rules: List[Rule]) extends HasPosition
  case class Rule(pos: Pos, name: Symbol, body: Exp) extends HasPosition
  sealed trait Exp extends HasPosition
  case class Seq(pos: Pos, lhs: Exp, rhs: Exp) extends Exp
  case class Alt(pos: Pos, lhs: Exp, rhs: Exp) extends Exp
  case class Rep0(pos: Pos, body: Exp) extends Exp
  case class Rep1(pos: Pos, body: Exp) extends Exp
  case class Opt(pos: Pos, body: Exp) extends Exp
  case class AndPred(pos: Pos, body: Exp) extends Exp
  case class NotPred(pos: Pos, body: Exp) extends Exp
  case class Str(pos: Pos, target: String) extends Exp
  case class Wildcard(pos: Pos) extends Exp
  case class CharClass(pos: Pos, elems: List[CharClassElement]) extends Exp
  case class Ident(pos: Pos, name: Symbol) extends Exp
  sealed trait CharClassElement
  case class OneChar(ch: Char) extends CharClassElement
  case class CharRange(from: Char, to: Char) extends CharClassElement
}
import scala.util.parsing.combinator._
import scala.util.parsing.input
import input._
import java.io._
import Ast._

object PegParser {
  case class ParseException(pos: Pos, msg: String) extends Exception(pos + msg)
  
  /*
   * See
   * Bryan Ford, "Parsing Expression Grammars: A Recognition-Based Syntactic Foundation",
   * Symposium on Principles of Programming Languages,2004.
   */
  object ParserCore extends Parsers {
    type Elem = Char 
    private val any: Parser[Char] = elem(".", c => c != CharSequenceReader.EofCh)
    private def chr(c: Char): Parser[Char] = c
    private def crange(f: Char, t: Char): Parser[Char] = elem("[]", c => f <= c && c <= t)
    private def cset(cs: Char*): Parser[Char] = elem("[]", c => cs.findIndexOf(_ == c) >= 0)
    private val escapeMap: Map[Char, Char] = Map(
      'n' -> '\n', 'r' -> '\r', 't' -> '\t', '\'' -> '\'', '"' -> '"', '[' -> '[',
      ']' -> ']', '\\' -> '\\'
    )
    lazy val GRAMMER: Parser[Grammar] = (loc <~ Spacing) ~ Definition.+ <~ EndOfFile ^^ {
      case pos ~ rules => Grammar(Pos(pos.line, pos.column), rules.head.name, rules)
    }
    lazy val Definition: Parser[Rule] = (Identifier <~ (LEFTARROW | EQ)) ~ 
      Expression <~ SEMI_COLON ^^ {
      case n ~ b => Rule(n.pos, n.name, b)
    }      
    lazy val Expression: Parser[Exp] = Sequence ~ (SLASH ~> Sequence).* ^^ {
      case x ~ xs => xs.foldLeft(x){(a, y) => Alt(y.pos, a, y)}
    }
    lazy val Sequence: Parser[Exp] = Prefix.+ ^^ {case x::xs => 
      xs.foldLeft(x){(a, y) => Seq(y.pos, a, y)}
    }
    lazy val Prefix: Parser[Exp] = (
      (loc <~ AND) ~ Suffix ^^ { case pos ~ e => AndPred(Pos(pos.line, pos.column), e) }
    | (loc <~ NOT) ~ Suffix ^^ { case pos ~ e => NotPred(Pos(pos.line, pos.column), e) }
    | Suffix
    )
    lazy val Suffix: Parser[Exp] = (
      loc ~ Primary <~ QUESTION ^^ { case pos ~ e => Opt(Pos(pos.line, pos.column), e) }
    | loc ~ Primary <~ STAR ^^ { case pos ~ e => Rep0(Pos(pos.line, pos.column), e) }
    | loc ~ Primary <~ PLUS ^^ { case pos ~ e => Rep1(Pos(pos.line, pos.column), e) }
    | Primary
    )
    lazy val Primary: Parser[Exp] = (
      Identifier /*<~ not(LEFTARROW | EQ)*/
    | OPEN ~> Expression <~ CLOSE
    | Literal
    | CLASS
    | loc <~ DOT ^^ { case pos => Wildcard(Pos(pos.line, pos.column)) }
    )
    lazy val loc: Parser[Position] = Parser{reader => Success(reader.pos, reader)}    
    lazy val Identifier: Parser[Ident] = loc ~ IdentStart ~ IdentCont.* <~Spacing ^^ {
      case pos ~ s ~ c => Ident(Pos(pos.line, pos.column), Symbol("" + s + c.foldLeft("")(_ + _)))
    }
    lazy val IdentStart: Parser[Char] = crange('a','z') | crange('A','Z') | '_'
    lazy val IdentCont: Parser[Char] = IdentStart | crange('0','9')
    lazy val Literal: Parser[Str] = loc ~ (
      chr('\'') ~> (not('\'') ~> CHAR).* <~ chr('\'') <~ Spacing
    | chr('"') ~> (not('"') ~> CHAR).* <~ chr('"') <~ Spacing
    ) ^^ {
      case pos ~ cs => Str(Pos(pos.line, pos.column), cs.foldLeft(""){(acc, n) => acc + n})
    }
    lazy val CLASS: Parser[CharClass] = (loc <~ chr('[')) ~ (not(chr(']')) ~> Range).* <~ ']' ~> Spacing ^^ {
      case (pos ~ rs) => CharClass(Pos(pos.line, pos.column), rs);
    }
    lazy val Range: Parser[CharClassElement] = (
      CHAR ~ '-' ~ CHAR ^^ { case f~_~t => CharRange(f, t) }
    | CHAR ^^ { case c => OneChar(c) }
    )
    lazy val CHAR: Parser[Char] = ( 
      chr('\\') ~ cset('n','r','t','\'','"','[',']','\\') ^^ { case _ ~ c => escapeMap(c) }
    | chr('\\') ~ crange('0','2') ~ crange('0','7') ~ crange('0','7') ^^ { 
        case _ ~ a ~ b ~ c => Integer.parseInt("" + a + b + c, 8).toChar
      }
    | chr('\\') ~ crange('0','7') ~ opt(crange('0','7')) ^^ {
        case _ ~ a ~ Some(b) => Integer.parseInt("" + a + b, 8).toChar
        case _ ~ a ~ _ => Integer.parseInt("" + a, 8).toChar
      }
    | not('\\') ~ any ^^ { case _ ~ c => c}
    )
    lazy val SEMI_COLON = ';' <~ Spacing
    lazy val EQ = chr('=') <~ Spacing
    lazy val LEFTARROW = chr('<') ~ '-' <~ Spacing
    lazy val SLASH = '/' <~ Spacing
    lazy val AND = '&' <~ Spacing
    lazy val NOT = '!' <~ Spacing
    lazy val QUESTION = '?' <~ Spacing
    lazy val STAR = '*' <~ Spacing
    lazy val PLUS = '+' <~ Spacing
    lazy val OPEN = '(' <~ Spacing
    lazy val CLOSE = ')' <~ Spacing
    lazy val DOT = '.' <~ Spacing
    lazy val Spacing = (Space | Comment).*
    lazy val Comment = chr('#') ~ (not(EndOfLine) ~ any).* ~ EndOfLine
    lazy val Space = chr(' ') | chr('\t') | EndOfLine
    lazy val EndOfLine = chr('\r') ~ chr('\n') | chr('\n') | chr('\r')
    lazy val EndOfFile = not(any)
  }
  
  def parse(fileName: String, content: java.io.Reader): Grammar = {
    ParserCore.GRAMMER(input.StreamReader(content)) match {
      case ParserCore.Success(node, _) => node
      case ParserCore.Failure(msg, rest) => 
        val pos = rest.pos
        throw new ParseException(Pos(pos.line, pos.column), msg)
      case ParserCore.Error(msg, rest) =>
        val pos = rest.pos
        throw new ParseException(Pos(pos.line, pos.column), msg)        
    }
  }
  
  def main(args: Array[String]) {
    val g = parse(args(0), new FileReader(args(0)))
    println(g)
  }
}