package main
|
|
|
|
import (
|
|
"fmt"
|
|
"math/big"
|
|
"strconv"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"git.bonsai.cool/kayprish/pj1/pj1-go/util"
|
|
)
|
|
|
|
// TokenType identifies the lexical category of a Token.
type TokenType int

const (
	// Single-character tokens.
	LEFT_PAREN TokenType = iota
	RIGHT_PAREN
	LEFT_BRACE
	RIGHT_BRACE
	COMMA
	DOT
	MINUS
	PLUS
	SEMICOLON

	// One or two character tokens.
	STAR
	STAR_STAR
	BANG
	BANG_EQUAL
	EQUAL
	EQUAL_EQUAL
	GREATER
	GREATER_EQUAL
	LESS
	LESS_EQUAL
	SLASH
	SLASH_DOT
	SLASH_UNDERSCORE
	SLASH_MODULO
	MODULO

	// Literals
	IDENTIFIER
	STRING
	INTEGER
	FLOAT

	// Keywords.
	AND
	CLASS
	ELSE
	FALSE
	FOR
	FUN
	IF
	NIL
	OR
	PRINT
	RETURN
	SUPER
	THIS
	TRUE
	VAR
	WHILE

	// EOF is the end-of-input sentinel appended by ScanTokens.
	EOF
)
|
|
|
|
//go:generate go run golang.org/x/tools/cmd/stringer -type=TokenType
|
|
|
|
// keywords maps each reserved word to its token type; identifier()
// consults it to distinguish keywords from plain identifiers.
var keywords = map[string]TokenType{
	"and":    AND,
	"class":  CLASS,
	"else":   ELSE,
	"false":  FALSE,
	"for":    FOR,
	"fun":    FUN,
	"if":     IF,
	"nil":    NIL,
	"or":     OR,
	"print":  PRINT,
	"return": RETURN,
	"super":  SUPER,
	"this":   THIS,
	"true":   TRUE,
	"var":    VAR,
	"while":  WHILE,
}
|
|
|
|
// Token is a single scanned lexeme: its category, the exact source
// text, an optional literal value (set for STRING/INTEGER/FLOAT
// tokens, nil otherwise), and the line it appeared on.
type Token struct {
	ttype   TokenType
	Lexeme  string
	literal interface{}
	line    int
}
|
|
|
|
func NewToken(ttype TokenType, lexeme string, literal interface{}, line int) Token {
|
|
t := Token{ttype, lexeme, literal, line}
|
|
return t
|
|
}
|
|
|
|
// String renders the token as "<type> <lexeme> <literal>" for
// debugging output (TokenType's %v form comes from the generated
// stringer).
func (t Token) String() string {
	return fmt.Sprintf("%v %v %v", t.ttype, t.Lexeme, t.literal)
}
|
|
|
|
// Lexer scans a source string into a flat list of Tokens.
type Lexer struct {
	source string
	Tokens []Token

	// startByte/startRune mark the beginning of the lexeme currently
	// being scanned; currentByte/currentRune are the scan position.
	// Byte offsets index into source; rune counts track characters
	// (the two differ on multi-byte UTF-8 input).
	startByte   int
	currentByte int
	startRune   int
	currentRune int
	// line is the 1-based current line, used for error reporting.
	line int
}
|
|
|
|
func NewLexer(source string) Lexer {
|
|
l := Lexer{source, []Token{}, 0, 0, 0, 0, 1}
|
|
return l
|
|
}
|
|
|
|
// ScanTokens scans the entire source, appending one Token per lexeme
// to l.Tokens, and terminates the list with an EOF token.
func (l *Lexer) ScanTokens() {
	for !l.atEnd() {
		// We are at the beginning of the next lexeme; remember where
		// it starts so addToken can slice its text out of source.
		l.startByte = l.currentByte
		l.startRune = l.currentRune
		l.scanToken()
	}

	l.Tokens = append(l.Tokens, NewToken(EOF, "", nil, l.line))
}
|
|
|
|
func (l Lexer) atEnd() bool {
|
|
return l.currentByte >= len(l.source)
|
|
}
|
|
|
|
func (l *Lexer) scanToken() {
|
|
c := l.advance()
|
|
switch c {
|
|
case '(':
|
|
l.addSimpleToken(LEFT_PAREN)
|
|
case ')':
|
|
l.addSimpleToken(RIGHT_PAREN)
|
|
case '{':
|
|
l.addSimpleToken(LEFT_BRACE)
|
|
case '}':
|
|
l.addSimpleToken(RIGHT_BRACE)
|
|
case ',':
|
|
l.addSimpleToken(COMMA)
|
|
case '.':
|
|
l.addSimpleToken(DOT)
|
|
case '-':
|
|
l.addSimpleToken(MINUS)
|
|
case '+':
|
|
l.addSimpleToken(PLUS)
|
|
case ';':
|
|
l.addSimpleToken(SEMICOLON)
|
|
case '*':
|
|
if l.match('*') {
|
|
l.addSimpleToken(STAR_STAR)
|
|
} else {
|
|
l.addSimpleToken(STAR)
|
|
}
|
|
l.addSimpleToken(STAR)
|
|
case '!':
|
|
if l.match('=') {
|
|
l.addSimpleToken(BANG_EQUAL)
|
|
} else {
|
|
l.addSimpleToken(BANG)
|
|
}
|
|
case '=':
|
|
if l.match('=') {
|
|
l.addSimpleToken(EQUAL_EQUAL)
|
|
} else {
|
|
l.addSimpleToken(EQUAL)
|
|
}
|
|
case '<':
|
|
if l.match('=') {
|
|
l.addSimpleToken(LESS_EQUAL)
|
|
} else {
|
|
l.addSimpleToken(LESS)
|
|
}
|
|
case '>':
|
|
if l.match('=') {
|
|
l.addSimpleToken(GREATER_EQUAL)
|
|
} else {
|
|
l.addSimpleToken(GREATER)
|
|
}
|
|
case '%':
|
|
l.addSimpleToken(MODULO)
|
|
case '/':
|
|
if l.match('/') {
|
|
// A comment goes until the end of the line
|
|
for l.peek() != '\n' && !l.atEnd() {
|
|
l.advance()
|
|
}
|
|
} else if l.match('*') {
|
|
l.advance()
|
|
l.advance() // swallow the "/*"
|
|
nesting := 1
|
|
// A comment goes until we reach */, however,
|
|
// the comments can also nest
|
|
for {
|
|
if l.peek() == '/' && l.peekNext() == '*' {
|
|
nesting++
|
|
l.advance()
|
|
l.advance()
|
|
} else if l.peek() == '*' && l.peekNext() == '/' {
|
|
nesting--
|
|
l.advance()
|
|
l.advance()
|
|
} else {
|
|
if l.peek() == '\n' {
|
|
l.line++
|
|
}
|
|
l.advance()
|
|
}
|
|
}
|
|
} else if l.match('.') {
|
|
l.addSimpleToken(SLASH_DOT)
|
|
} else if l.match('_') {
|
|
l.addSimpleToken(SLASH_UNDERSCORE)
|
|
} else if l.match('%') {
|
|
l.addSimpleToken(SLASH_MODULO)
|
|
} else {
|
|
l.addSimpleToken(SLASH)
|
|
}
|
|
case ' ', '\r', '\t':
|
|
case '\n':
|
|
l.line++
|
|
case '"':
|
|
l.str()
|
|
|
|
default:
|
|
if isDigit(c) {
|
|
l.number()
|
|
} else if isAlpha(c) {
|
|
l.identifier()
|
|
} else {
|
|
// TODO: if there are multiple bad characters
|
|
// coalesce similar errors into one
|
|
util.Error(l.line, fmt.Sprintf("Unexpected character, %v.", c))
|
|
}
|
|
}
|
|
}
|
|
|
|
func (l *Lexer) identifier() {
|
|
for isAlphaNumeric(l.peek()) {
|
|
l.advance()
|
|
}
|
|
|
|
text := l.source[l.startByte:l.currentByte]
|
|
ttype, ok := keywords[text]
|
|
|
|
if !ok {
|
|
ttype = IDENTIFIER
|
|
}
|
|
|
|
l.addSimpleToken(ttype)
|
|
}
|
|
|
|
func (l *Lexer) number() {
|
|
isInt := true
|
|
for isDigit(l.peek()) {
|
|
l.advance()
|
|
}
|
|
|
|
if l.peek() == '.' && isDigit(l.peekNext()) {
|
|
l.advance()
|
|
isInt = false
|
|
|
|
for isDigit(l.peek()) {
|
|
l.advance()
|
|
}
|
|
}
|
|
|
|
// Only allow integer and float literals, other values can be
|
|
// made by combining these
|
|
if isInt {
|
|
var bigint big.Int
|
|
bigint.SetString(l.source[l.startByte:l.currentByte], 10)
|
|
l.addToken(INTEGER, bigint)
|
|
} else {
|
|
float, _ := strconv.ParseFloat(l.source[l.startByte:l.currentByte], 64)
|
|
l.addToken(FLOAT, float)
|
|
}
|
|
}
|
|
|
|
func (l *Lexer) match(expected rune) bool {
|
|
if l.atEnd() {
|
|
return false
|
|
}
|
|
ch, width := utf8.DecodeRuneInString(l.source[l.currentByte:])
|
|
if ch == expected {
|
|
l.currentRune++
|
|
l.currentByte += width
|
|
return true
|
|
} else {
|
|
return false
|
|
}
|
|
}
|
|
|
|
func (l Lexer) peek() rune {
|
|
if l.atEnd() {
|
|
return '\x00'
|
|
}
|
|
ch, _ := utf8.DecodeRuneInString(l.source[l.currentByte:])
|
|
return ch
|
|
}
|
|
|
|
func (l Lexer) peekNext() rune {
|
|
_, width1 := utf8.DecodeRuneInString(l.source[l.currentByte:])
|
|
// "width1 == 0" signifies we reached the end of the string with the first character
|
|
if width1 == 0 || l.currentByte+width1 >= len(l.source) {
|
|
return '\x00'
|
|
}
|
|
ch2, _ := utf8.DecodeRuneInString(l.source[l.currentByte+width1:])
|
|
return ch2
|
|
}
|
|
|
|
// isAlpha reports whether c is an ASCII letter or an underscore.
func isAlpha(c rune) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	default:
		return c == '_'
	}
}
|
|
|
|
func isAlphaNumeric(c rune) bool {
|
|
return isAlpha(c) || isDigit(c)
|
|
}
|
|
|
|
// isDigit reports whether c is an ASCII decimal digit.
func isDigit(c rune) bool {
	return '0' <= c && c <= '9'
}
|
|
|
|
func (l *Lexer) advance() rune {
|
|
ch, width := utf8.DecodeRuneInString(l.source[l.currentByte:])
|
|
l.currentRune++
|
|
l.currentByte += width
|
|
return ch
|
|
}
|
|
|
|
func (l *Lexer) str() {
|
|
var sb strings.Builder
|
|
for l.peek() != '"' && !l.atEnd() {
|
|
c := l.peek()
|
|
if c == '\n' {
|
|
l.line++
|
|
} else if c == '\\' {
|
|
l.advance()
|
|
// TODO: add more escape sequences, including \xNN
|
|
switch l.peek() {
|
|
case 'n':
|
|
c = '\n'
|
|
case 'r':
|
|
c = '\r'
|
|
case 't':
|
|
c = '\t'
|
|
case '\\':
|
|
c = '\\'
|
|
case '"':
|
|
c = '"'
|
|
case '\'':
|
|
c = '\''
|
|
case 'e':
|
|
c = '\x1b'
|
|
default:
|
|
util.Error(l.line, fmt.Sprintf("Invalid escape sequence \\%v.", l.peek()))
|
|
return
|
|
}
|
|
}
|
|
sb.WriteRune(c)
|
|
l.advance()
|
|
}
|
|
|
|
if l.atEnd() {
|
|
util.Error(l.line, "Unterminated string.")
|
|
return
|
|
}
|
|
|
|
// Closing ".
|
|
l.advance()
|
|
|
|
value := sb.String()
|
|
l.addToken(STRING, value)
|
|
}
|
|
|
|
// addSimpleToken appends a token that carries no literal value.
// ("Simple" refers to "having no literal"; TODO: rename function.)
func (l *Lexer) addSimpleToken(ttype TokenType) {
	l.addToken(ttype, nil)
}
|
|
|
|
func (l *Lexer) addToken(ttype TokenType, literal interface{}) {
|
|
text := l.source[l.startByte:l.currentByte]
|
|
l.Tokens = append(l.Tokens, NewToken(ttype, text, literal, l.line))
|
|
}
|