package main
|
|
|
|
import (
|
|
"fmt"
|
|
"math/big"
|
|
"strconv"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"git.bonsai.cool/kayprish/pj1/pj1-go/util"
|
|
)
|
|
|
|
// TokenType identifies the lexical category of a Token.
type TokenType int

const (
	// Single-character tokens.
	LEFT_PAREN TokenType = iota
	RIGHT_PAREN
	LEFT_BRACE
	RIGHT_BRACE
	COMMA
	DOT
	MINUS
	PLUS
	SEMICOLON

	// One or two character tokens.
	STAR
	STAR_STAR
	BANG
	BANG_EQUAL
	EQUAL
	EQUAL_EQUAL
	GREATER
	GREATER_EQUAL
	LESS
	LESS_EQUAL
	SLASH
	SLASH_DOT
	SLASH_UNDERSCORE
	SLASH_MODULO
	MODULO

	// Literals
	IDENTIFIER
	STRING
	INTEGER
	FLOAT

	// Keywords.
	AND
	CLASS
	ELSE
	FALSE
	FOR
	FUN
	IF
	NIL
	OR
	PRINT
	RETURN
	SUPER
	THIS
	TRUE
	VAR
	WHILE

	// EOF is the end-of-input sentinel appended by ScanTokens.
	EOF
)
|
|
|
|
//go:generate go run golang.org/x/tools/cmd/stringer -type=TokenType
|
|
|
|
// keywords maps each reserved word to its token type; identifier()
// consults it to distinguish keywords from plain identifiers.
var keywords = map[string]TokenType{
	"and":    AND,
	"class":  CLASS,
	"else":   ELSE,
	"false":  FALSE,
	"for":    FOR,
	"fun":    FUN,
	"if":     IF,
	"nil":    NIL,
	"or":     OR,
	"print":  PRINT,
	"return": RETURN,
	"super":  SUPER,
	"this":   THIS,
	"true":   TRUE,
	"var":    VAR,
	"while":  WHILE,
}
|
|
|
|
// Token is a single scanned lexeme: its category, the exact source
// text, an optional literal value (set for STRING/INTEGER/FLOAT
// tokens, nil otherwise), and the line it appeared on.
type Token struct {
	ttype   TokenType
	Lexeme  string
	literal interface{}
	line    int
}
|
|
|
|
func NewToken(ttype TokenType, lexeme string, literal interface{}, line int) Token {
|
|
t := Token{ttype, lexeme, literal, line}
|
|
return t
|
|
}
|
|
|
|
// String renders the token as "<type> <lexeme> <literal>" for
// debugging output (TokenType's %v form comes from the generated
// stringer).
func (t Token) String() string {
	return fmt.Sprintf("%v %v %v", t.ttype, t.Lexeme, t.literal)
}
|
|
|
|
// Lexer scans a source string into a flat list of Tokens.
type Lexer struct {
	source string
	Tokens []Token

	// startByte/startRune mark the beginning of the lexeme currently
	// being scanned; currentByte/currentRune are the scan position.
	// Byte offsets index into source; rune counts track characters
	// (the two differ on multi-byte UTF-8 input).
	startByte   int
	currentByte int
	startRune   int
	currentRune int
	// line is the 1-based current line, used for error reporting.
	line int
}
|
|
|
|
func NewLexer(source string) Lexer {
|
|
l := Lexer{source, []Token{}, 0, 0, 0, 0, 1}
|
|
return l
|
|
}
|
|
|
|
// ScanTokens scans the entire source, appending one Token per lexeme
// to l.Tokens, and terminates the list with an EOF token.
func (l *Lexer) ScanTokens() {
	for !l.atEnd() {
		// We are at the beginning of the next lexeme; remember where
		// it starts so addToken can slice its text out of source.
		l.startByte = l.currentByte
		l.startRune = l.currentRune
		l.scanToken()
	}

	l.Tokens = append(l.Tokens, NewToken(EOF, "", nil, l.line))
}
|
|
|
|
func (l Lexer) atEnd() bool {
|
|
return l.currentByte >= len(l.source)
|
|
}
|
|
|
|
func (l *Lexer) scanToken() {
|
|
c := l.advance()
|
|
switch c {
|
|
case '(':
|
|
l.addSimpleToken(LEFT_PAREN)
|
|
case ')':
|
|
l.addSimpleToken(RIGHT_PAREN)
|
|
case '{':
|
|
l.addSimpleToken(LEFT_BRACE)
|
|
case '}':
|
|
l.addSimpleToken(RIGHT_BRACE)
|
|
case ',':
|
|
l.addSimpleToken(COMMA)
|
|
case '.':
|
|
l.addSimpleToken(DOT)
|
|
case '-':
|
|
l.addSimpleToken(MINUS)
|
|
case '+':
|
|
l.addSimpleToken(PLUS)
|
|
case ';':
|
|
l.addSimpleToken(SEMICOLON)
|
|
case '*':
|
|
if l.match('*') {
|
|
l.addSimpleToken(STAR_STAR)
|
|
} else {
|
|
l.addSimpleToken(STAR)
|
|
}
|
|
l.addSimpleToken(STAR)
|
|
case '!':
|
|
if l.match('=') {
|
|
l.addSimpleToken(BANG_EQUAL)
|
|
} else {
|
|
l.addSimpleToken(BANG)
|
|
}
|
|
case '=':
|
|
if l.match('=') {
|
|
l.addSimpleToken(EQUAL_EQUAL)
|
|
} else {
|
|
l.addSimpleToken(EQUAL)
|
|
}
|
|
case '<':
|
|
if l.match('=') {
|
|
l.addSimpleToken(LESS_EQUAL)
|
|
} else {
|
|
l.addSimpleToken(LESS)
|
|
}
|
|
case '>':
|
|
if l.match('=') {
|
|
l.addSimpleToken(GREATER_EQUAL)
|
|
} else {
|
|
l.addSimpleToken(GREATER)
|
|
}
|
|
case '%':
|
|
l.addSimpleToken(MODULO)
|
|
case '/':
|
|
if l.match('/') {
|
|
// A comment goes until the end of the line
|
|
for l.peek() != '\n' && !l.atEnd() {
|
|
l.advance()
|
|
}
|
|
} else if l.match('*') {
|
|
l.advance()
|
|
l.advance() // swallow the "/*"
|
|
nesting := 1
|
|
// A comment goes until we reach */, however,
|
|
// the comments can also nest
|
|
for {
|
|
if l.peek() == '/' && l.peekNext() == '*' {
|
|
nesting++
|
|
l.advance()
|
|
l.advance()
|
|
} else if l.peek() == '*' && l.peekNext() == '/' {
|
|
nesting--
|
|
l.advance()
|
|
l.advance()
|
|
} else {
|
|
if l.peek() == '\n' {
|
|
l.line++
|
|
}
|
|
l.advance()
|
|
}
|
|
}
|
|
} else if l.match('.') {
|
|
l.addSimpleToken(SLASH_DOT)
|
|
} else if l.match('_') {
|
|
l.addSimpleToken(SLASH_UNDERSCORE)
|
|
} else if l.match('%') {
|
|
l.addSimpleToken(SLASH_MODULO)
|
|
} else {
|
|
l.addSimpleToken(SLASH)
|
|
}
|
|
case ' ', '\r', '\t':
|
|
case '\n':
|
|
l.line++
|
|
case '"':
|
|
l.str()
|
|
|
|
default:
|
|
if isDigit(c) {
|
|
l.number()
|
|
} else if isAlpha(c) {
|
|
l.identifier()
|
|
} else {
|
|
// TODO: if there are multiple bad characters
|
|
// coalesce similar errors into one
|
|
util.Error(l.line, fmt.Sprintf("Unexpected character, %v.", c))
|
|
}
|
|
}
|
|
}
|
|
|
|
func (l *Lexer) identifier() {
|
|
for isAlphaNumeric(l.peek()) {
|
|
l.advance()
|
|
}
|
|
|
|
text := l.source[l.startByte:l.currentByte]
|
|
ttype, ok := keywords[text]
|
|
|
|
if !ok {
|
|
ttype = IDENTIFIER
|
|
}
|
|
|
|
l.addSimpleToken(ttype)
|
|
}
|
|
|
|
func (l *Lexer) number() {
|
|
isInt := true
|
|
for isDigit(l.peek()) {
|
|
l.advance()
|
|
}
|
|
|
|
if l.peek() == '.' && isDigit(l.peekNext()) {
|
|
l.advance()
|
|
isInt = false
|
|
|
|
for isDigit(l.peek()) {
|
|
l.advance()
|
|
}
|
|
}
|
|
|
|
// Only allow integer and float literals, other values can be
|
|
// made by combining these
|
|
if isInt {
|
|
var bigint big.Int
|
|
bigint.SetString(l.source[l.startByte:l.currentByte], 10)
|
|
l.addToken(INTEGER, bigint)
|
|
} else {
|
|
float, _ := strconv.ParseFloat(l.source[l.startByte:l.currentByte], 64)
|
|
l.addToken(FLOAT, float)
|
|
}
|
|
}
|
|
|
|
func (l *Lexer) match(expected rune) bool {
|
|
if l.atEnd() {
|
|
return false
|
|
}
|
|
ch, width := utf8.DecodeRuneInString(l.source[l.currentByte:])
|
|
if ch == expected {
|
|
l.currentRune++
|
|
l.currentByte += width
|
|
return true
|
|
} else {
|
|
return false
|
|
}
|
|
}
|
|
|
|
func (l Lexer) peek() rune {
|
|
if l.atEnd() {
|
|
return '\x00'
|
|
}
|
|
ch, _ := utf8.DecodeRuneInString(l.source[l.currentByte:])
|
|
return ch
|
|
}
|
|
|
|
func (l Lexer) peekNext() rune {
|
|
_, width1 := utf8.DecodeRuneInString(l.source[l.currentByte:])
|
|
// "width1 == 0" signifies we reached the end of the string with the first character
|
|
if width1 == 0 || l.currentByte+width1 >= len(l.source) {
|
|
return '\x00'
|
|
}
|
|
ch2, _ := utf8.DecodeRuneInString(l.source[l.currentByte+width1:])
|
|
return ch2
|
|
}
|
|
|
|
// isAlpha reports whether c is an ASCII letter or an underscore.
func isAlpha(c rune) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	default:
		return c == '_'
	}
}
|
|
|
|
func isAlphaNumeric(c rune) bool {
|
|
return isAlpha(c) || isDigit(c)
|
|
}
|
|
|
|
// isDigit reports whether c is an ASCII decimal digit.
func isDigit(c rune) bool {
	return '0' <= c && c <= '9'
}
|
|
|
|
func (l *Lexer) advance() rune {
|
|
ch, width := utf8.DecodeRuneInString(l.source[l.currentByte:])
|
|
l.currentRune++
|
|
l.currentByte += width
|
|
return ch
|
|
}
|
|
|
|
func (l *Lexer) str() {
|
|
var sb strings.Builder
|
|
for l.peek() != '"' && !l.atEnd() {
|
|
c := l.peek()
|
|
if c == '\n' {
|
|
l.line++
|
|
} else if c == '\\' {
|
|
l.advance()
|
|
// TODO: add more escape sequences, including \xNN
|
|
switch l.peek() {
|
|
case 'n':
|
|
c = '\n'
|
|
case 'r':
|
|
c = '\r'
|
|
case 't':
|
|
c = '\t'
|
|
case '\\':
|
|
c = '\\'
|
|
case '"':
|
|
c = '"'
|
|
case '\'':
|
|
c = '\''
|
|
case 'e':
|
|
c = '\x1b'
|
|
default:
|
|
util.Error(l.line, fmt.Sprintf("Invalid escape sequence \\%v.", l.peek()))
|
|
return
|
|
}
|
|
}
|
|
sb.WriteRune(c)
|
|
l.advance()
|
|
}
|
|
|
|
if l.atEnd() {
|
|
util.Error(l.line, "Unterminated string.")
|
|
return
|
|
}
|
|
|
|
// Closing ".
|
|
l.advance()
|
|
|
|
value := sb.String()
|
|
l.addToken(STRING, value)
|
|
}
|
|
|
|
// addSimpleToken appends a token that carries no literal value.
// ("Simple" refers to "having no literal"; TODO: rename function.)
func (l *Lexer) addSimpleToken(ttype TokenType) {
	l.addToken(ttype, nil)
}
|
|
|
|
func (l *Lexer) addToken(ttype TokenType, literal interface{}) {
|
|
text := l.source[l.startByte:l.currentByte]
|
|
l.Tokens = append(l.Tokens, NewToken(ttype, text, literal, l.line))
|
|
}
|