// pj1/pj1-go/lexer.go

package main

import (
	"fmt"
	"math/big"
	"strconv"
	"strings"
	"unicode/utf8"
)
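
// TokenType identifies the kind of a lexical token. Its String method is
// generated by the stringer directive below the const block.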
type TokenType int

const (
	// Single-character tokens.
	LEFT_PAREN TokenType = iota
	RIGHT_PAREN
	LEFT_BRACE
	RIGHT_BRACE
	COMMA
	DOT
	MINUS
	PLUS
	SEMICOLON
	STAR

	// One- or two-character tokens.
	BANG
	BANG_EQUAL
	EQUAL
	EQUAL_EQUAL
	GREATER
	GREATER_EQUAL
	LESS
	LESS_EQUAL
	SLASH
	SLASH_DOT
	SLASH_UNDERSCORE

	// Literals.
	IDENTIFIER
	STRING
	INTEGER
	FLOAT

	// Keywords.
	AND
	CLASS
	ELSE
	FALSE
	FOR
	FUN
	IF
	NIL
	OR
	PRINT
	RETURN
	SUPER
	THIS
	TRUE
	VAR
	WHILE

	EOF
)

//go:generate go run golang.org/x/tools/cmd/stringer -type=TokenType
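
// keywords maps reserved words to their token types; identifier scanning
// consults this table so keywords are never emitted as IDENTIFIER.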
var keywords = map[string]TokenType{
	"and":    AND,
	"class":  CLASS,
	"else":   ELSE,
	"false":  FALSE,
	"for":    FOR,
	"fun":    FUN,
	"if":     IF,
	"nil":    NIL,
	"or":     OR,
	"print":  PRINT,
	"return": RETURN,
	"super":  SUPER,
	"this":   THIS,
	"true":   TRUE,
	"var":    VAR,
	"while":  WHILE,
}
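
// Token is one scanned lexeme: its type, the source text it covers, its
// literal value (nil when it carries none), and the line it appears on.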
type Token struct {
	ttype   TokenType
	lexeme  string
	literal interface{}
	line    int
}

func NewToken(ttype TokenType, lexeme string, literal interface{}, line int) Token {
	return Token{ttype, lexeme, literal, line}
}

func (t Token) String() string {
	return fmt.Sprintf("%v %v %v", t.ttype, t.lexeme, t.literal)
}
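
// Lexer scans source text into a slice of Tokens. Positions are tracked both
// as byte offsets (for slicing the UTF-8 source) and as rune counts;
// startByte/startRune mark where the token currently being scanned begins.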
type Lexer struct {
	source      string
	tokens      []Token
	startByte   int
	currentByte int
	startRune   int
	currentRune int
	line        int
}

func NewLexer(source string) Lexer {
	return Lexer{source, []Token{}, 0, 0, 0, 0, 1}
}
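
// ScanTokens scans the entire source, appending the tokens to l.tokens and
// terminating the stream with a single EOF token.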
func (l *Lexer) ScanTokens() {
	for !l.atEnd() {
		l.startByte = l.currentByte
		l.startRune = l.currentRune
		l.scanToken()
	}
	l.tokens = append(l.tokens, NewToken(EOF, "", nil, l.line))
}

func (l Lexer) atEnd() bool {
	return l.currentByte >= len(l.source)
}
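
// scanToken consumes one lexeme and, unless it is whitespace or a comment,
// appends the corresponding token.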
func (l *Lexer) scanToken() {
	c := l.advance()
	switch c {
	case '(':
		l.addSimpleToken(LEFT_PAREN)
	case ')':
		l.addSimpleToken(RIGHT_PAREN)
	case '{':
		l.addSimpleToken(LEFT_BRACE)
	case '}':
		l.addSimpleToken(RIGHT_BRACE)
	case ',':
		l.addSimpleToken(COMMA)
	case '.':
		l.addSimpleToken(DOT)
	case '-':
		l.addSimpleToken(MINUS)
	case '+':
		l.addSimpleToken(PLUS)
	case ';':
		l.addSimpleToken(SEMICOLON)
	case '*':
		l.addSimpleToken(STAR)
	case '!':
		if l.match('=') {
			l.addSimpleToken(BANG_EQUAL)
		} else {
			l.addSimpleToken(BANG)
		}
	case '=':
		if l.match('=') {
			l.addSimpleToken(EQUAL_EQUAL)
		} else {
			l.addSimpleToken(EQUAL)
		}
	case '<':
		if l.match('=') {
			l.addSimpleToken(LESS_EQUAL)
		} else {
			l.addSimpleToken(LESS)
		}
	case '>':
		if l.match('=') {
			l.addSimpleToken(GREATER_EQUAL)
		} else {
			l.addSimpleToken(GREATER)
		}
	case '/':
		if l.match('/') {
			// A line comment goes until the end of the line.
			for l.peek() != '\n' && !l.atEnd() {
				l.advance()
			}
		} else if l.match('*') {
			// The opening "/*" has already been consumed: '/' by advance
			// above, '*' by match. Block comments run until the matching
			// "*/" and may nest.
			nesting := 1
			for nesting > 0 && !l.atEnd() {
				if l.peek() == '/' && l.peekNext() == '*' {
					nesting++
					l.advance()
					l.advance()
				} else if l.peek() == '*' && l.peekNext() == '/' {
					nesting--
					l.advance()
					l.advance()
				} else {
					if l.peek() == '\n' {
						l.line++
					}
					l.advance()
				}
			}
			if nesting > 0 {
				error(l.line, "Unterminated block comment.")
			}
		} else if l.match('.') {
			l.addSimpleToken(SLASH_DOT)
		} else if l.match('_') {
			l.addSimpleToken(SLASH_UNDERSCORE)
		} else {
			l.addSimpleToken(SLASH)
		}
	case ' ', '\r', '\t':
		// Ignore whitespace.
	case '\n':
		l.line++
	case '"':
		l.str()
	default:
		if isDigit(c) {
			l.number()
		} else if isAlpha(c) {
			l.identifier()
		} else {
			// TODO: if there are multiple bad characters in a row,
			// coalesce the errors into one.
			error(l.line, fmt.Sprintf("Unexpected character %q.", c))
		}
	}
}
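
// identifier scans an identifier, emitting a keyword token type when the
// text matches a reserved word.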
func (l *Lexer) identifier() {
	for isAlphaNumeric(l.peek()) {
		l.advance()
	}
	text := l.source[l.startByte:l.currentByte]
	ttype, ok := keywords[text]
	if !ok {
		ttype = IDENTIFIER
	}
	l.addSimpleToken(ttype)
}
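
// number scans an integer or float literal. Integers are stored as big.Int,
// so they are arbitrary-precision; floats are 64-bit.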
func (l *Lexer) number() {
	isInt := true
	for isDigit(l.peek()) {
		l.advance()
	}
	// Look for a fractional part; the '.' only belongs to the number when a
	// digit follows it.
	if l.peek() == '.' && isDigit(l.peekNext()) {
		l.advance()
		isInt = false
		for isDigit(l.peek()) {
			l.advance()
		}
	}
	// Only integer and float literals exist; other numeric values can be
	// built by combining these.
	if isInt {
		// The lexeme is all digits, so SetString cannot fail here.
		var bigint big.Int
		bigint.SetString(l.source[l.startByte:l.currentByte], 10)
		l.addToken(INTEGER, bigint)
	} else {
		float, _ := strconv.ParseFloat(l.source[l.startByte:l.currentByte], 64)
		l.addToken(FLOAT, float)
	}
}
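
// match consumes the next rune only if it equals expected, reporting whether
// it did.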
func (l *Lexer) match(expected rune) bool {
	if l.atEnd() {
		return false
	}
	ch, width := utf8.DecodeRuneInString(l.source[l.currentByte:])
	if ch != expected {
		return false
	}
	l.currentRune++
	l.currentByte += width
	return true
}
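
// peek returns the next rune without consuming it, or NUL at end of input.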
func (l Lexer) peek() rune {
	if l.atEnd() {
		return '\x00'
	}
	ch, _ := utf8.DecodeRuneInString(l.source[l.currentByte:])
	return ch
}
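
// peekNext returns the rune after the next one without consuming anything,
// or NUL when fewer than two runes remain.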
func (l Lexer) peekNext() rune {
	_, width1 := utf8.DecodeRuneInString(l.source[l.currentByte:])
	// A width of 0 means we are already at the end of the string.
	if width1 == 0 || l.currentByte+width1 >= len(l.source) {
		return '\x00'
	}
	ch2, _ := utf8.DecodeRuneInString(l.source[l.currentByte+width1:])
	return ch2
}
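
// The character classes below are ASCII-only: identifiers are restricted to
// ASCII even though strings and comments may contain arbitrary UTF-8.
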
func isAlpha(c rune) bool {
	return (c >= 'a' && c <= 'z') ||
		(c >= 'A' && c <= 'Z') ||
		c == '_'
}

func isAlphaNumeric(c rune) bool {
	return isAlpha(c) || isDigit(c)
}

func isDigit(c rune) bool {
	return c >= '0' && c <= '9'
}
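
// advance consumes the next rune and returns it, moving both the byte and
// rune cursors past it.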
func (l *Lexer) advance() rune {
	ch, width := utf8.DecodeRuneInString(l.source[l.currentByte:])
	l.currentRune++
	l.currentByte += width
	return ch
}
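
// str scans a string literal, translating escape sequences as it goes; the
// opening '"' has already been consumed. Strings may span multiple lines.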
func (l *Lexer) str() {
	var sb strings.Builder
	for l.peek() != '"' && !l.atEnd() {
		c := l.peek()
		if c == '\n' {
			l.line++
		} else if c == '\\' {
			// Consume the backslash, then translate the escaped character.
			l.advance()
			// TODO: add more escape sequences, including \xNN.
			switch l.peek() {
			case 'n':
				c = '\n'
			case 'r':
				c = '\r'
			case 't':
				c = '\t'
			case '\\':
				c = '\\'
			case '"':
				c = '"'
			case '\'':
				c = '\''
			case 'e':
				c = '\x1b'
			default:
				error(l.line, fmt.Sprintf("Invalid escape sequence \\%c.", l.peek()))
				return
			}
		}
		sb.WriteRune(c)
		l.advance()
	}
	if l.atEnd() {
		error(l.line, "Unterminated string.")
		return
	}
	// Consume the closing '"'.
	l.advance()
	value := sb.String()
	l.addToken(STRING, value)
}

// addSimpleToken adds a token that has no literal value; "simple" refers to
// exactly that (TODO: rename the function).
func (l *Lexer) addSimpleToken(ttype TokenType) {
	l.addToken(ttype, nil)
}

// addToken appends a token whose lexeme is the source text scanned so far,
// i.e. the bytes between startByte and currentByte.
func (l *Lexer) addToken(ttype TokenType, literal interface{}) {
	text := l.source[l.startByte:l.currentByte]
	l.tokens = append(l.tokens, NewToken(ttype, text, literal, l.line))
}
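
// A minimal usage sketch. This file calls a package-level helper
// error(line int, message string) for reporting scan errors, which is
// assumed to be defined elsewhere in the package.
//
//	lexer := NewLexer("var answer = 6 * 7;")
//	lexer.ScanTokens()
//	for _, tok := range lexer.tokens {
//		fmt.Println(tok)
//	}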