Initial commit: completed simple lexer for pj1

2024-11-25 22:53:29 +01:00 · 2024-11-25 22:53:29 +01:00 · 7f72ba59d5
commit 7f72ba59d5
6 changed files with 522 additions and 0 deletions
--- a/pj1-go/go.mod
+++ b/pj1-go/go.mod
@ -0,0 +1,10 @@
+module git.bonsai.cool/kayprish/pj1/pj1-go
+
+go 1.18
+
+require golang.org/x/tools v0.1.13-0.20220917004541-4d18923f060e
+
+require (
+	golang.org/x/mod v0.12.0 // indirect
+	golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f // indirect
+)
--- a/pj1-go/go.sum
+++ b/pj1-go/go.sum
@ -0,0 +1,6 @@
+golang.org/x/mod v0.12.0 h1:rmsUpXtvNzj340zd98LZ4KntptpfRHwpFOHG188oHXc=
+golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f h1:v4INt8xihDGvnrfjMDVXGxw9wrfxYyCjk0KbXjhR55s=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/tools v0.1.13-0.20220917004541-4d18923f060e h1:K/LreqAwv7hZaSPyj5LvaiQd2wROouJDabf2r+oBqUw=
+golang.org/x/tools v0.1.13-0.20220917004541-4d18923f060e/go.mod h1:VsjNM1dMo+Ofkp5d7y7fOdQZD8MTXSQ4w3EPk65AvKU=
--- a/pj1-go/lexer.go
+++ b/pj1-go/lexer.go
@ -0,0 +1,360 @@
+package main
+
+import (
+	"fmt"
+	"math/big"
+	"strconv"
+	"strings"
+	"unicode/utf8"
+)
+
+// TokenType identifies the lexical category of a Token.
+type TokenType int
+
+// Token categories produced by the lexer. Values are assigned sequentially by
+// iota, starting with LEFT_PAREN = 0. The generated String method in
+// tokentype_string.go is tied to these values, so re-run the stringer command
+// after adding, removing or reordering entries.
+const (
+	// Single-character tokens.
+	LEFT_PAREN TokenType = iota
+	RIGHT_PAREN
+	LEFT_BRACE
+	RIGHT_BRACE
+	COMMA
+	DOT
+	MINUS
+	PLUS
+	SEMICOLON
+	STAR
+
+	// One or two character tokens.
+	BANG
+	BANG_EQUAL
+	EQUAL
+	EQUAL_EQUAL
+	GREATER
+	GREATER_EQUAL
+	LESS
+	LESS_EQUAL
+	SLASH
+	SLASH_DOT        // lexeme "/."
+	SLASH_UNDERSCORE // lexeme "/_"
+
+	// Literals
+	IDENTIFIER
+	STRING
+	INTEGER
+	FLOAT
+
+	// Keywords.
+	AND
+	CLASS
+	ELSE
+	FALSE
+	FOR
+	FUN
+	IF
+	NIL
+	OR
+	PRINT
+	RETURN
+	SUPER
+	THIS
+	TRUE
+	VAR
+	WHILE
+
+	// EOF is appended once by ScanTokens after the whole source is consumed.
+	EOF
+)
+
+//go:generate go run golang.org/x/tools/cmd/stringer -type=TokenType
+
+// keywords maps each reserved word to its token type. The identifier scanner
+// looks up every scanned word here; words that are absent become IDENTIFIER
+// tokens.
+var keywords = map[string]TokenType{
+	"and":    AND,
+	"class":  CLASS,
+	"else":   ELSE,
+	"false":  FALSE,
+	"for":    FOR,
+	"fun":    FUN,
+	"if":     IF,
+	"nil":    NIL,
+	"or":     OR,
+	"print":  PRINT,
+	"return": RETURN,
+	"super":  SUPER,
+	"this":   THIS,
+	"true":   TRUE,
+	"var":    VAR,
+	"while":  WHILE,
+}
+
+// Token is a single lexical unit produced by the Lexer.
+type Token struct {
+	ttype   TokenType   // lexical category
+	lexeme  string      // exact source text the token was scanned from
+	literal interface{} // parsed value for STRING/INTEGER/FLOAT tokens, nil otherwise
+	line    int         // lexer line counter at the moment the token was added
+}
+
+// NewToken builds a Token from its category, source text, parsed literal
+// value (nil when the token carries none) and line number.
+func NewToken(ttype TokenType, lexeme string, literal interface{}, line int) Token {
+	return Token{
+		ttype:   ttype,
+		lexeme:  lexeme,
+		literal: literal,
+		line:    line,
+	}
+}
+
+func (t Token) String() string {
+	return fmt.Sprintf("%v %v %v", t.ttype, t.lexeme, t.literal)
+}
+
+// Lexer turns a source string into a slice of Tokens. It decodes the source
+// as UTF-8 one rune at a time and tracks positions both in bytes and in runes.
+type Lexer struct {
+	source string  // complete input text
+	tokens []Token // tokens emitted so far
+
+	startByte   int // byte offset where the token being scanned begins
+	currentByte int // byte offset of the next rune to be read
+	startRune   int // rune index where the token being scanned begins
+	currentRune int // rune index of the next rune to be read
+	line        int // current line number; starts at 1, bumped on each '\n'
+}
+
+// NewLexer returns a Lexer positioned at the start of source, with an empty
+// token list and the line counter set to 1.
+func NewLexer(source string) Lexer {
+	return Lexer{
+		source: source,
+		tokens: []Token{},
+		line:   1,
+	}
+}
+
+// ScanTokens lexes the whole source, appending every token to l.tokens and
+// finishing with a single EOF token that carries the final line number.
+func (l *Lexer) ScanTokens() {
+	for {
+		if l.atEnd() {
+			break
+		}
+		// Each iteration starts a fresh lexeme at the current position.
+		l.startByte, l.startRune = l.currentByte, l.currentRune
+		l.scanToken()
+	}
+
+	eof := NewToken(EOF, "", nil, l.line)
+	l.tokens = append(l.tokens, eof)
+}
+
+// atEnd reports whether every byte of the source has been consumed.
+func (l Lexer) atEnd() bool {
+	return len(l.source) <= l.currentByte
+}
+
+// scanToken consumes one lexeme starting at the current position and, unless
+// it is whitespace or a comment, appends the matching token. Characters that
+// start no known lexeme are reported through error and skipped.
+func (l *Lexer) scanToken() {
+	c := l.advance()
+	switch c {
+	case '(':
+		l.addSimpleToken(LEFT_PAREN)
+	case ')':
+		l.addSimpleToken(RIGHT_PAREN)
+	case '{':
+		l.addSimpleToken(LEFT_BRACE)
+	case '}':
+		l.addSimpleToken(RIGHT_BRACE)
+	case ',':
+		l.addSimpleToken(COMMA)
+	case '.':
+		l.addSimpleToken(DOT)
+	case '-':
+		l.addSimpleToken(MINUS)
+	case '+':
+		l.addSimpleToken(PLUS)
+	case ';':
+		l.addSimpleToken(SEMICOLON)
+	case '*':
+		l.addSimpleToken(STAR)
+	case '!':
+		if l.match('=') {
+			l.addSimpleToken(BANG_EQUAL)
+		} else {
+			l.addSimpleToken(BANG)
+		}
+	case '=':
+		if l.match('=') {
+			l.addSimpleToken(EQUAL_EQUAL)
+		} else {
+			l.addSimpleToken(EQUAL)
+		}
+	case '<':
+		if l.match('=') {
+			l.addSimpleToken(LESS_EQUAL)
+		} else {
+			l.addSimpleToken(LESS)
+		}
+	case '>':
+		if l.match('=') {
+			l.addSimpleToken(GREATER_EQUAL)
+		} else {
+			l.addSimpleToken(GREATER)
+		}
+	case '/':
+		switch {
+		case l.match('/'):
+			// A comment goes until the end of the line. The newline itself is
+			// left in place so the '\n' case below still counts the line.
+			for l.peek() != '\n' && !l.atEnd() {
+				l.advance()
+			}
+		case l.match('.'):
+			l.addSimpleToken(SLASH_DOT)
+		case l.match('_'):
+			l.addSimpleToken(SLASH_UNDERSCORE)
+		default:
+			l.addSimpleToken(SLASH)
+		}
+	case ' ', '\r', '\t':
+		// Insignificant whitespace.
+	case '\n':
+		l.line++
+	case '"':
+		l.str()
+
+	default:
+		switch {
+		case isDigit(c):
+			l.number()
+		case isAlpha(c):
+			l.identifier()
+		default:
+			// TODO: if there are multiple bad characters
+			// coalesce similar errors into one
+			//
+			// %q prints the offending character itself (quoted and escaped
+			// when unprintable); %v on a rune would print its code point.
+			error(l.line, fmt.Sprintf("Unexpected character, %q.", c))
+		}
+	}
+}
+
+// identifier consumes the remaining letters, digits and underscores of a word
+// and emits either the matching keyword token or an IDENTIFIER token.
+func (l *Lexer) identifier() {
+	for isAlphaNumeric(l.peek()) {
+		l.advance()
+	}
+
+	word := l.source[l.startByte:l.currentByte]
+	if kw, reserved := keywords[word]; reserved {
+		l.addSimpleToken(kw)
+		return
+	}
+	l.addSimpleToken(IDENTIFIER)
+}
+
+func (l *Lexer) number() {
+	isInt := true
+	for isDigit(l.peek()) {
+		l.advance()
+	}
+
+	if l.peek() == '.' && isDigit(l.peekNext()) {
+		l.advance()
+		isInt = false
+
+		for isDigit(l.peek()) {
+			l.advance()
+		}
+	}
+
+	// Only allow integer and float literals, other values can be
+	// made by combining these
+	if isInt {
+		var bigint big.Int
+		bigint.SetString(l.source[l.startByte:l.currentByte], 10)
+		l.addToken(INTEGER, bigint)
+	} else {
+		float, _ := strconv.ParseFloat(l.source[l.startByte:l.currentByte], 64)
+		l.addToken(FLOAT, float)
+	}
+}
+
+// match consumes the next rune and returns true only when that rune equals
+// expected; otherwise the position is left untouched and it returns false.
+func (l *Lexer) match(expected rune) bool {
+	if l.atEnd() {
+		return false
+	}
+	next, size := utf8.DecodeRuneInString(l.source[l.currentByte:])
+	if next != expected {
+		return false
+	}
+	l.currentByte += size
+	l.currentRune++
+	return true
+}
+
+// peek returns the next rune without consuming it, or '\x00' once the source
+// is exhausted.
+func (l Lexer) peek() rune {
+	if !l.atEnd() {
+		next, _ := utf8.DecodeRuneInString(l.source[l.currentByte:])
+		return next
+	}
+	return '\x00'
+}
+
+// peekNext returns the rune after the next one without consuming anything,
+// or '\x00' when fewer than two runes remain.
+func (l Lexer) peekNext() rune {
+	rest := l.source[l.currentByte:]
+	_, firstWidth := utf8.DecodeRuneInString(rest)
+	// A zero width means there is no first rune at all; a width covering the
+	// whole remainder means there is nothing after it.
+	if firstWidth == 0 || firstWidth >= len(rest) {
+		return '\x00'
+	}
+	second, _ := utf8.DecodeRuneInString(rest[firstWidth:])
+	return second
+}
+
+// isAlpha reports whether c is an ASCII letter or an underscore.
+func isAlpha(c rune) bool {
+	switch {
+	case 'a' <= c && c <= 'z':
+		return true
+	case 'A' <= c && c <= 'Z':
+		return true
+	case c == '_':
+		return true
+	}
+	return false
+}
+
+// isAlphaNumeric reports whether c may appear inside an identifier: an ASCII
+// letter, an ASCII digit or an underscore.
+func isAlphaNumeric(c rune) bool {
+	if isDigit(c) {
+		return true
+	}
+	return isAlpha(c)
+}
+
+// isDigit reports whether c is one of the ASCII digits '0' through '9'.
+func isDigit(c rune) bool {
+	return '0' <= c && c <= '9'
+}
+
+// advance decodes the rune at the current position, moves past it (updating
+// both the byte offset and the rune index) and returns it.
+func (l *Lexer) advance() rune {
+	r, size := utf8.DecodeRuneInString(l.source[l.currentByte:])
+	l.currentByte += size
+	l.currentRune++
+	return r
+}
+
+// str scans a string literal whose opening quote has already been consumed
+// and emits a STRING token whose literal is the decoded text. Newlines are
+// allowed inside the literal and advance the line counter. The supported
+// escapes are \n \r \t \\ \" \' and \e (ESC, 0x1b). An unknown escape or a
+// missing closing quote is reported through error and no token is emitted.
+func (l *Lexer) str() {
+	var sb strings.Builder
+	for l.peek() != '"' && !l.atEnd() {
+		c := l.peek()
+		switch c {
+		case '\n':
+			l.line++
+		case '\\':
+			// Consume the backslash; the escape character is now under peek.
+			l.advance()
+			if l.atEnd() {
+				// A trailing backslash means the literal was cut off, not
+				// that a NUL escape was written.
+				error(l.line, "Unterminated string.")
+				return
+			}
+			// TODO: add more escape sequences, including \xNN
+			switch esc := l.peek(); esc {
+			case 'n':
+				c = '\n'
+			case 'r':
+				c = '\r'
+			case 't':
+				c = '\t'
+			case '\\':
+				c = '\\'
+			case '"':
+				c = '"'
+			case '\'':
+				c = '\''
+			case 'e':
+				c = '\x1b'
+			default:
+				// %c shows the escape as written (e.g. \q); %v on a rune
+				// would print its numeric code point instead.
+				error(l.line, fmt.Sprintf("Invalid escape sequence \\%c.", esc))
+				return
+			}
+		}
+		sb.WriteRune(c)
+		l.advance()
+	}
+
+	if l.atEnd() {
+		error(l.line, "Unterminated string.")
+		return
+	}
+
+	// Closing ".
+	l.advance()
+
+	value := sb.String()
+	l.addToken(STRING, value)
+}
+
+// Simple refers to "having no literal" (TODO: rename function)
+func (l *Lexer) addSimpleToken(ttype TokenType) {
+	l.addToken(ttype, nil)
+}
+
+// addToken appends a token of the given type and literal value. The lexeme is
+// the source text between the token start and the current position, and the
+// token is stamped with the current line counter.
+func (l *Lexer) addToken(ttype TokenType, literal interface{}) {
+	lexeme := l.source[l.startByte:l.currentByte]
+	tok := NewToken(ttype, lexeme, literal, l.line)
+	l.tokens = append(l.tokens, tok)
+}
--- a/pj1-go/pj1.go
+++ b/pj1-go/pj1.go
@ -0,0 +1,74 @@
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"io/ioutil"
+	"os"
+)
+
+var (
+	// hadError is set by error whenever a problem is reported. runFile uses
+	// it to exit with status 65; runPrompt clears it after every line.
+	hadError bool = false
+)
+
+// main dispatches on the command line: with no argument it starts the
+// interactive prompt, with one argument it runs that script file, and with
+// more it prints a usage line and exits with status 64.
+func main() {
+	switch n := len(os.Args); {
+	case n > 2:
+		fmt.Println("Usage: pj1-go [script]")
+		os.Exit(64)
+	case n == 2:
+		// os.Args[0] is the executable itself; the script path is the first
+		// real argument.
+		runFile(os.Args[1])
+	default:
+		runPrompt()
+	}
+}
+
+// runFile reads the script at path and runs it. If the file cannot be read
+// the error is written to stderr and the process exits with status 66; if
+// running the script reported any error the process exits with status 65.
+func runFile(path string) {
+	contents, err := ioutil.ReadFile(path)
+	if err != nil {
+		// Report on stderr and fail the process so callers can tell that
+		// nothing was run. 66 is the sysexits "cannot open input" code,
+		// alongside the 64 and 65 already used in this file.
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(66)
+	}
+	run(string(contents))
+
+	if hadError {
+		os.Exit(65)
+	}
+}
+
+// runPrompt runs an interactive loop on standard input: it prints a "> "
+// prompt, reads one line, echoes it, runs it, and clears the error flag so
+// one bad line does not affect the next. The loop stops when input ends; a
+// read error is reported on stderr.
+func runPrompt() {
+	stdin := bufio.NewScanner(os.Stdin)
+	for {
+		fmt.Print("> ")
+		if more := stdin.Scan(); !more {
+			break
+		}
+		input := stdin.Text()
+		fmt.Println(input)
+		run(input)
+
+		hadError = false
+	}
+
+	err := stdin.Err()
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "reading standard input:", err)
+	}
+}
+
+func run(source string) {
+	lexer := NewLexer(source)
+	lexer.ScanTokens()
+	var tokens []Token = lexer.tokens
+
+	for _, token := range tokens {
+		fmt.Println(token)
+	}
+}
+
+// TODO: might have to rename
+func error(line int, msg string) {
+	report(line, "", msg)
+	hadError = true
+}
+
+// report writes a diagnostic of the form "[line N] Error<where>: <msg>" to
+// standard error, followed by a newline.
+func report(line int, where string, msg string) {
+	prefix := "[line " + fmt.Sprint(line) + "] Error" + where
+	fmt.Fprintln(os.Stderr, prefix+": "+msg)
+}
--- a/pj1-go/tokentype_string.go
+++ b/pj1-go/tokentype_string.go
@ -0,0 +1,64 @@
+// Code generated by "stringer -type=TokenType"; DO NOT EDIT.
+
+package main
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[LEFT_PAREN-0]
+	_ = x[RIGHT_PAREN-1]
+	_ = x[LEFT_BRACE-2]
+	_ = x[RIGHT_BRACE-3]
+	_ = x[COMMA-4]
+	_ = x[DOT-5]
+	_ = x[MINUS-6]
+	_ = x[PLUS-7]
+	_ = x[SEMICOLON-8]
+	_ = x[STAR-9]
+	_ = x[BANG-10]
+	_ = x[BANG_EQUAL-11]
+	_ = x[EQUAL-12]
+	_ = x[EQUAL_EQUAL-13]
+	_ = x[GREATER-14]
+	_ = x[GREATER_EQUAL-15]
+	_ = x[LESS-16]
+	_ = x[LESS_EQUAL-17]
+	_ = x[SLASH-18]
+	_ = x[SLASH_DOT-19]
+	_ = x[SLASH_UNDERSCORE-20]
+	_ = x[IDENTIFIER-21]
+	_ = x[STRING-22]
+	_ = x[INTEGER-23]
+	_ = x[FLOAT-24]
+	_ = x[AND-25]
+	_ = x[CLASS-26]
+	_ = x[ELSE-27]
+	_ = x[FALSE-28]
+	_ = x[FOR-29]
+	_ = x[FUN-30]
+	_ = x[IF-31]
+	_ = x[NIL-32]
+	_ = x[OR-33]
+	_ = x[PRINT-34]
+	_ = x[RETURN-35]
+	_ = x[SUPER-36]
+	_ = x[THIS-37]
+	_ = x[TRUE-38]
+	_ = x[VAR-39]
+	_ = x[WHILE-40]
+	_ = x[EOF-41]
+}
+
+const _TokenType_name = "LEFT_PARENRIGHT_PARENLEFT_BRACERIGHT_BRACECOMMADOTMINUSPLUSSEMICOLONSTARBANGBANG_EQUALEQUALEQUAL_EQUALGREATERGREATER_EQUALLESSLESS_EQUALSLASHSLASH_DOTSLASH_UNDERSCOREIDENTIFIERSTRINGINTEGERFLOATANDCLASSELSEFALSEFORFUNIFNILORPRINTRETURNSUPERTHISTRUEVARWHILEEOF"
+
+var _TokenType_index = [...]uint16{0, 10, 21, 31, 42, 47, 50, 55, 59, 68, 72, 76, 86, 91, 102, 109, 122, 126, 136, 141, 150, 166, 176, 182, 189, 194, 197, 202, 206, 211, 214, 217, 219, 222, 224, 229, 235, 240, 244, 248, 251, 256, 259}
+
+func (i TokenType) String() string {
+	if i < 0 || i >= TokenType(len(_TokenType_index)-1) {
+		return "TokenType(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+	return _TokenType_name[_TokenType_index[i]:_TokenType_index[i+1]]
+}
--- a/pj1-go/tools.go
+++ b/pj1-go/tools.go
@ -0,0 +1,8 @@
+//go:build tools
+// +build tools
+
+package tools
+
+import (
+	_ "golang.org/x/tools/cmd/stringer"
+)