Initial commit: completed simple lexer for pj1

This commit is contained in:
Petar Kapriš 2024-11-25 22:53:29 +01:00
commit 7f72ba59d5
6 changed files with 522 additions and 0 deletions

10
pj1-go/go.mod Normal file
View file

@ -0,0 +1,10 @@
module git.bonsai.cool/kayprish/pj1/pj1-go
go 1.18
require golang.org/x/tools v0.1.13-0.20220917004541-4d18923f060e
require (
golang.org/x/mod v0.12.0 // indirect
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f // indirect
)

6
pj1-go/go.sum Normal file
View file

@ -0,0 +1,6 @@
golang.org/x/mod v0.12.0 h1:rmsUpXtvNzj340zd98LZ4KntptpfRHwpFOHG188oHXc=
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f h1:v4INt8xihDGvnrfjMDVXGxw9wrfxYyCjk0KbXjhR55s=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/tools v0.1.13-0.20220917004541-4d18923f060e h1:K/LreqAwv7hZaSPyj5LvaiQd2wROouJDabf2r+oBqUw=
golang.org/x/tools v0.1.13-0.20220917004541-4d18923f060e/go.mod h1:VsjNM1dMo+Ofkp5d7y7fOdQZD8MTXSQ4w3EPk65AvKU=

360
pj1-go/lexer.go Normal file
View file

@ -0,0 +1,360 @@
package main
import (
"fmt"
"math/big"
"strconv"
"strings"
"unicode/utf8"
)
// TokenType enumerates the kinds of token the lexer can produce.
type TokenType int
// Token types, in the order the generated stringer expects.
// NOTE: the iota values are load-bearing (tokentype_string.go indexes by
// them) — do not reorder without re-running go generate.
const (
	// Single-character tokens.
	LEFT_PAREN TokenType = iota
	RIGHT_PAREN
	LEFT_BRACE
	RIGHT_BRACE
	COMMA
	DOT
	MINUS
	PLUS
	SEMICOLON
	STAR
	// One or two character tokens.
	BANG
	BANG_EQUAL
	EQUAL
	EQUAL_EQUAL
	GREATER
	GREATER_EQUAL
	LESS
	LESS_EQUAL
	SLASH
	SLASH_DOT
	SLASH_UNDERSCORE
	// Literals
	IDENTIFIER
	STRING
	INTEGER
	FLOAT
	// Keywords.
	AND
	CLASS
	ELSE
	FALSE
	FOR
	FUN
	IF
	NIL
	OR
	PRINT
	RETURN
	SUPER
	THIS
	TRUE
	VAR
	WHILE
	EOF
)
//go:generate go run golang.org/x/tools/cmd/stringer -type=TokenType
// keywords maps reserved-word spellings to their token types; scanned
// identifiers are looked up here before defaulting to IDENTIFIER.
var keywords = map[string]TokenType{
	"and":    AND,
	"class":  CLASS,
	"else":   ELSE,
	"false":  FALSE,
	"for":    FOR,
	"fun":    FUN,
	"if":     IF,
	"nil":    NIL,
	"or":     OR,
	"print":  PRINT,
	"return": RETURN,
	"super":  SUPER,
	"this":   THIS,
	"true":   TRUE,
	"var":    VAR,
	"while":  WHILE,
}
// Token is a single lexeme scanned from the source: its type, its raw
// text, its parsed literal value (if any), and the line it appeared on.
type Token struct {
	ttype   TokenType
	lexeme  string      // raw source text of the token
	literal interface{} // parsed value (string / big.Int / float64) or nil
	line    int         // 1-based line number where the token starts
}
// NewToken builds a Token value from its four components.
func NewToken(ttype TokenType, lexeme string, literal interface{}, line int) Token {
	return Token{
		ttype:   ttype,
		lexeme:  lexeme,
		literal: literal,
		line:    line,
	}
}
// String renders the token as "<type> <lexeme> <literal>" for debugging.
func (t Token) String() string {
	return fmt.Sprint(t.ttype) + " " + fmt.Sprint(t.lexeme) + " " + fmt.Sprint(t.literal)
}
// Lexer turns a source string into a flat slice of tokens.  Positions
// are tracked both as byte offsets (for slicing source) and as rune
// indices.
type Lexer struct {
	source      string
	tokens      []Token
	startByte   int // byte offset where the current lexeme starts
	currentByte int // byte offset of the next rune to consume
	startRune   int // rune index where the current lexeme starts
	currentRune int // rune index of the next rune to consume
	line        int // current 1-based line number
}
// NewLexer creates a lexer positioned at the start of source, on line 1.
func NewLexer(source string) Lexer {
	return Lexer{
		source: source,
		tokens: []Token{},
		line:   1,
	}
}
// ScanTokens lexes the entire source, appending tokens to l.tokens and
// terminating the stream with an EOF token.
func (l *Lexer) ScanTokens() {
	for !l.atEnd() {
		// Remember where this lexeme begins, in bytes and in runes.
		l.startByte, l.startRune = l.currentByte, l.currentRune
		l.scanToken()
	}
	l.tokens = append(l.tokens, NewToken(EOF, "", nil, l.line))
}
// atEnd reports whether the whole source has been consumed.
func (l Lexer) atEnd() bool {
	return len(l.source) <= l.currentByte
}
// scanToken consumes one lexeme starting at the current position and
// appends the corresponding token, skips whitespace/comments, or
// reports an error for an unrecognized character.
func (l *Lexer) scanToken() {
	c := l.advance()
	switch c {
	case '(':
		l.addSimpleToken(LEFT_PAREN)
	case ')':
		l.addSimpleToken(RIGHT_PAREN)
	case '{':
		l.addSimpleToken(LEFT_BRACE)
	case '}':
		l.addSimpleToken(RIGHT_BRACE)
	case ',':
		l.addSimpleToken(COMMA)
	case '.':
		l.addSimpleToken(DOT)
	case '-':
		l.addSimpleToken(MINUS)
	case '+':
		l.addSimpleToken(PLUS)
	case ';':
		l.addSimpleToken(SEMICOLON)
	case '*':
		l.addSimpleToken(STAR)
	case '!':
		if l.match('=') {
			l.addSimpleToken(BANG_EQUAL)
		} else {
			l.addSimpleToken(BANG)
		}
	case '=':
		if l.match('=') {
			l.addSimpleToken(EQUAL_EQUAL)
		} else {
			l.addSimpleToken(EQUAL)
		}
	case '<':
		if l.match('=') {
			l.addSimpleToken(LESS_EQUAL)
		} else {
			l.addSimpleToken(LESS)
		}
	case '>':
		if l.match('=') {
			l.addSimpleToken(GREATER_EQUAL)
		} else {
			l.addSimpleToken(GREATER)
		}
	case '/':
		if l.match('/') {
			// A comment goes until the end of the line
			for l.peek() != '\n' && !l.atEnd() {
				l.advance()
			}
		} else if l.match('.') {
			l.addSimpleToken(SLASH_DOT)
		} else if l.match('_') {
			l.addSimpleToken(SLASH_UNDERSCORE)
		} else {
			l.addSimpleToken(SLASH)
		}
	case ' ', '\r', '\t':
		// Insignificant whitespace: ignore.
	case '\n':
		l.line++
	case '"':
		l.str()
	default:
		if isDigit(c) {
			l.number()
		} else if isAlpha(c) {
			l.identifier()
		} else {
			// TODO: if there are multiple bad characters
			// coalesce similar errors into one
			// %q prints the offending rune itself ('é'); %v would
			// print its numeric code point, which is unhelpful.
			error(l.line, fmt.Sprintf("Unexpected character, %q.", c))
		}
	}
}
// identifier consumes the rest of an identifier and emits either the
// matching keyword token or IDENTIFIER.
func (l *Lexer) identifier() {
	for isAlphaNumeric(l.peek()) {
		l.advance()
	}
	word := l.source[l.startByte:l.currentByte]
	if kw, isKeyword := keywords[word]; isKeyword {
		l.addSimpleToken(kw)
		return
	}
	l.addSimpleToken(IDENTIFIER)
}
// number consumes an integer or float literal and emits a token whose
// literal is a big.Int (INTEGER) or a float64 (FLOAT).
func (l *Lexer) number() {
	consumeDigits := func() {
		for isDigit(l.peek()) {
			l.advance()
		}
	}
	consumeDigits()
	isFloat := false
	// A '.' belongs to the number only when a digit follows it.
	if l.peek() == '.' && isDigit(l.peekNext()) {
		isFloat = true
		l.advance()
		consumeDigits()
	}
	// Only allow integer and float literals, other values can be
	// made by combining these
	text := l.source[l.startByte:l.currentByte]
	if isFloat {
		f, _ := strconv.ParseFloat(text, 64)
		l.addToken(FLOAT, f)
		return
	}
	var n big.Int
	n.SetString(text, 10)
	l.addToken(INTEGER, n)
}
// match conditionally consumes the next rune: it advances and returns
// true only when that rune equals expected.
func (l *Lexer) match(expected rune) bool {
	if l.atEnd() {
		return false
	}
	r, w := utf8.DecodeRuneInString(l.source[l.currentByte:])
	if r != expected {
		return false
	}
	l.currentRune++
	l.currentByte += w
	return true
}
// peek returns the current rune without consuming it, or NUL at the
// end of the source.
func (l Lexer) peek() rune {
	if l.atEnd() {
		return '\x00'
	}
	r, _ := utf8.DecodeRuneInString(l.source[l.currentByte:])
	return r
}
// peekNext returns the rune after the current one without consuming
// anything, or NUL if fewer than two runes remain.
func (l Lexer) peekNext() rune {
	_, w := utf8.DecodeRuneInString(l.source[l.currentByte:])
	// A width of 0 means the cursor was already at the end of the source.
	if w == 0 || len(l.source) <= l.currentByte+w {
		return '\x00'
	}
	r, _ := utf8.DecodeRuneInString(l.source[l.currentByte+w:])
	return r
}
// isAlpha reports whether c is an ASCII letter or underscore — the
// runes that may begin an identifier.
func isAlpha(c rune) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	default:
		return c == '_'
	}
}
// isAlphaNumeric reports whether c may appear inside an identifier.
func isAlphaNumeric(c rune) bool {
	if isAlpha(c) {
		return true
	}
	return isDigit(c)
}
// isDigit reports whether c is an ASCII decimal digit.
func isDigit(c rune) bool {
	return '0' <= c && c <= '9'
}
// advance consumes and returns the current rune, moving both the byte
// and the rune cursor forward.
func (l *Lexer) advance() rune {
	r, w := utf8.DecodeRuneInString(l.source[l.currentByte:])
	l.currentByte, l.currentRune = l.currentByte+w, l.currentRune+1
	return r
}
// str consumes a double-quoted string literal (the opening quote has
// already been consumed), translating escape sequences, and emits a
// STRING token whose literal is the unescaped text.  Reports an error
// on an invalid escape sequence or an unterminated string.
func (l *Lexer) str() {
	var sb strings.Builder
	for l.peek() != '"' && !l.atEnd() {
		c := l.peek()
		if c == '\n' {
			// Strings may span lines; keep the line counter accurate.
			l.line++
		} else if c == '\\' {
			l.advance()
			// TODO: add more escape sequences, including \xNN
			switch l.peek() {
			case 'n':
				c = '\n'
			case 'r':
				c = '\r'
			case 't':
				c = '\t'
			case '\\':
				c = '\\'
			case '"':
				c = '"'
			case '\'':
				c = '\''
			case 'e':
				c = '\x1b'
			default:
				// %c prints the offending rune itself; %v would print
				// its numeric code point, which is unhelpful here.
				error(l.line, fmt.Sprintf("Invalid escape sequence \\%c.", l.peek()))
				return
			}
		}
		sb.WriteRune(c)
		l.advance()
	}
	if l.atEnd() {
		error(l.line, "Unterminated string.")
		return
	}
	// Closing ".
	l.advance()
	value := sb.String()
	l.addToken(STRING, value)
}
// addSimpleToken appends a token with no literal value.
// Simple refers to "having no literal" (TODO: rename function)
func (l *Lexer) addSimpleToken(ttype TokenType) {
	l.addToken(ttype, nil)
}
// addToken appends a token whose lexeme is the source slice scanned so
// far (startByte..currentByte), tagged with the current line number.
func (l *Lexer) addToken(ttype TokenType, literal interface{}) {
	lexeme := l.source[l.startByte:l.currentByte]
	tok := NewToken(ttype, lexeme, literal, l.line)
	l.tokens = append(l.tokens, tok)
}

74
pj1-go/pj1.go Normal file
View file

@ -0,0 +1,74 @@
package main
import (
"bufio"
"fmt"
"io/ioutil"
"os"
)
var (
	// hadError records whether any lexing error has been reported; it
	// gates the non-zero exit status in runFile and is cleared between
	// REPL lines.
	hadError bool = false
)
// main dispatches on the command line: with no argument it starts an
// interactive prompt, with one argument it runs that script, and with
// more it prints usage and exits with status 64.
func main() {
	if len(os.Args) > 2 {
		fmt.Println("Usage: pj1-go [script]")
		os.Exit(64)
	} else if len(os.Args) == 2 {
		// BUG FIX: os.Args[0] is the program name; the script path
		// supplied by the user is at index 1.
		runFile(os.Args[1])
	} else {
		runPrompt()
	}
}
// runFile loads the script at path and executes it, exiting with
// status 65 when a lexing error was reported.
func runFile(path string) {
	src, err := ioutil.ReadFile(path)
	if err != nil {
		fmt.Println(err)
		return
	}
	run(string(src))
	if hadError {
		os.Exit(65)
	}
}
// runPrompt runs an interactive loop: read a line, echo it, run it,
// then clear the error flag so one mistake doesn't end the session.
func runPrompt() {
	in := bufio.NewScanner(os.Stdin)
	for {
		fmt.Print("> ")
		if ok := in.Scan(); !ok {
			break
		}
		text := in.Text()
		fmt.Println(text)
		run(text)
		hadError = false
	}
	if err := in.Err(); err != nil {
		fmt.Fprintln(os.Stderr, "reading standard input:", err)
	}
}
// run lexes the given source and prints each resulting token.
func run(source string) {
	lexer := NewLexer(source)
	lexer.ScanTokens()
	for _, tok := range lexer.tokens {
		fmt.Println(tok)
	}
}
// TODO: might have to rename
// error records a lexing error on the given line and sets the global
// hadError flag.
// NOTE(review): this function shadows Go's predeclared `error` type
// within this package — renaming (per the TODO) would avoid confusion,
// but callers in lexer.go depend on the current name.
func error(line int, msg string) {
	report(line, "", msg)
	hadError = true
}
// report writes a formatted error message to standard error.
func report(line int, where string, msg string) {
	fmt.Fprintf(os.Stderr, "[line %d] Error%s: %s\n", line, where, msg)
}

View file

@ -0,0 +1,64 @@
// Code generated by "stringer -type=TokenType"; DO NOT EDIT.
package main
import "strconv"
// Compile-time guard emitted by stringer: it fails to build if any
// TokenType constant value drifts from what the generated table assumes.
// NOTE(review): this file is generated ("DO NOT EDIT") — regenerate with
// `go generate` rather than editing by hand.
func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[LEFT_PAREN-0]
	_ = x[RIGHT_PAREN-1]
	_ = x[LEFT_BRACE-2]
	_ = x[RIGHT_BRACE-3]
	_ = x[COMMA-4]
	_ = x[DOT-5]
	_ = x[MINUS-6]
	_ = x[PLUS-7]
	_ = x[SEMICOLON-8]
	_ = x[STAR-9]
	_ = x[BANG-10]
	_ = x[BANG_EQUAL-11]
	_ = x[EQUAL-12]
	_ = x[EQUAL_EQUAL-13]
	_ = x[GREATER-14]
	_ = x[GREATER_EQUAL-15]
	_ = x[LESS-16]
	_ = x[LESS_EQUAL-17]
	_ = x[SLASH-18]
	_ = x[SLASH_DOT-19]
	_ = x[SLASH_UNDERSCORE-20]
	_ = x[IDENTIFIER-21]
	_ = x[STRING-22]
	_ = x[INTEGER-23]
	_ = x[FLOAT-24]
	_ = x[AND-25]
	_ = x[CLASS-26]
	_ = x[ELSE-27]
	_ = x[FALSE-28]
	_ = x[FOR-29]
	_ = x[FUN-30]
	_ = x[IF-31]
	_ = x[NIL-32]
	_ = x[OR-33]
	_ = x[PRINT-34]
	_ = x[RETURN-35]
	_ = x[SUPER-36]
	_ = x[THIS-37]
	_ = x[TRUE-38]
	_ = x[VAR-39]
	_ = x[WHILE-40]
	_ = x[EOF-41]
}
// Generated lookup table: all token names concatenated, with _TokenType_index
// holding the byte offset where each name begins (and the next ends).
const _TokenType_name = "LEFT_PARENRIGHT_PARENLEFT_BRACERIGHT_BRACECOMMADOTMINUSPLUSSEMICOLONSTARBANGBANG_EQUALEQUALEQUAL_EQUALGREATERGREATER_EQUALLESSLESS_EQUALSLASHSLASH_DOTSLASH_UNDERSCOREIDENTIFIERSTRINGINTEGERFLOATANDCLASSELSEFALSEFORFUNIFNILORPRINTRETURNSUPERTHISTRUEVARWHILEEOF"

var _TokenType_index = [...]uint16{0, 10, 21, 31, 42, 47, 50, 55, 59, 68, 72, 76, 86, 91, 102, 109, 122, 126, 136, 141, 150, 166, 176, 182, 189, 194, 197, 202, 206, 211, 214, 217, 219, 222, 224, 229, 235, 240, 244, 248, 251, 256, 259}
// String returns the name of the token type, or "TokenType(n)" for a
// value outside the generated table (generated by stringer; DO NOT EDIT).
func (i TokenType) String() string {
	if i < 0 || i >= TokenType(len(_TokenType_index)-1) {
		return "TokenType(" + strconv.FormatInt(int64(i), 10) + ")"
	}
	return _TokenType_name[_TokenType_index[i]:_TokenType_index[i+1]]
}

8
pj1-go/tools.go Normal file
View file

@ -0,0 +1,8 @@
//go:build tools
// +build tools
package tools
import (
_ "golang.org/x/tools/cmd/stringer"
)