package main

import (
	"fmt"
	"math/big"
	"strconv"
	"strings"
	"unicode/utf8"

	"git.bonsai.cool/kayprish/pj1/pj1-go/util"
)

type TokenType int

const (
	// Single-character tokens.
	LEFT_PAREN TokenType = iota
	RIGHT_PAREN
	LEFT_BRACE
	RIGHT_BRACE
	COMMA
	DOT
	MINUS
	PLUS
	SEMICOLON

	// One or two character tokens.
	STAR
	STAR_STAR
	BANG
	BANG_EQUAL
	EQUAL
	EQUAL_EQUAL
	GREATER
	GREATER_EQUAL
	LESS
	LESS_EQUAL
	SLASH
	SLASH_DOT
	SLASH_UNDERSCORE
	SLASH_MODULO
	MODULO

	// Literals.
	IDENTIFIER
	STRING
	INTEGER
	FLOAT

	// Keywords.
	AND
	CLASS
	ELSE
	FALSE
	FOR
	FUN
	IF
	NIL
	OR
	PRINT
	RETURN
	SUPER
	THIS
	TRUE
	VAR
	WHILE

	EOF
)

//go:generate go run golang.org/x/tools/cmd/stringer -type=TokenType

var keywords = map[string]TokenType{
	"and":    AND,
	"class":  CLASS,
	"else":   ELSE,
	"false":  FALSE,
	"for":    FOR,
	"fun":    FUN,
	"if":     IF,
	"nil":    NIL,
	"or":     OR,
	"print":  PRINT,
	"return": RETURN,
	"super":  SUPER,
	"this":   THIS,
	"true":   TRUE,
	"var":    VAR,
	"while":  WHILE,
}

type Token struct {
	ttype   TokenType
	Lexeme  string
	literal interface{}
	line    int
}

func NewToken(ttype TokenType, lexeme string, literal interface{}, line int) Token {
	return Token{ttype, lexeme, literal, line}
}

func (t Token) String() string {
	return fmt.Sprintf("%v %v %v", t.ttype, t.Lexeme, t.literal)
}

// Lexer scans a UTF-8 source string into Tokens. The start*/current*
// fields track the lexeme being scanned, both as byte offsets (for
// slicing source) and as rune offsets.
type Lexer struct {
	source string
	Tokens []Token

	startByte   int
	currentByte int
	startRune   int
	currentRune int
	line        int
}

func NewLexer(source string) Lexer {
	return Lexer{source, []Token{}, 0, 0, 0, 0, 1}
}

func (l *Lexer) ScanTokens() {
	for !l.atEnd() {
		// We are at the beginning of the next lexeme.
		l.startByte = l.currentByte
		l.startRune = l.currentRune
		l.scanToken()
	}
	l.Tokens = append(l.Tokens, NewToken(EOF, "", nil, l.line))
}

func (l Lexer) atEnd() bool {
	return l.currentByte >= len(l.source)
}

func (l *Lexer) scanToken() {
	c := l.advance()
	switch c {
	case '(':
		l.addSimpleToken(LEFT_PAREN)
	case ')':
		l.addSimpleToken(RIGHT_PAREN)
	case '{':
		l.addSimpleToken(LEFT_BRACE)
	case '}':
		l.addSimpleToken(RIGHT_BRACE)
	case ',':
		l.addSimpleToken(COMMA)
	case '.':
		l.addSimpleToken(DOT)
	case '-':
		l.addSimpleToken(MINUS)
	case '+':
		l.addSimpleToken(PLUS)
	case ';':
		l.addSimpleToken(SEMICOLON)
	case '*':
		if l.match('*') {
			l.addSimpleToken(STAR_STAR)
		} else {
			l.addSimpleToken(STAR)
		}
	case '!':
		if l.match('=') {
			l.addSimpleToken(BANG_EQUAL)
		} else {
			l.addSimpleToken(BANG)
		}
	case '=':
		if l.match('=') {
			l.addSimpleToken(EQUAL_EQUAL)
		} else {
			l.addSimpleToken(EQUAL)
		}
	case '<':
		if l.match('=') {
			l.addSimpleToken(LESS_EQUAL)
		} else {
			l.addSimpleToken(LESS)
		}
	case '>':
		if l.match('=') {
			l.addSimpleToken(GREATER_EQUAL)
		} else {
			l.addSimpleToken(GREATER)
		}
	case '%':
		l.addSimpleToken(MODULO)
	case '/':
		if l.match('/') {
			// A line comment goes until the end of the line.
			for l.peek() != '\n' && !l.atEnd() {
				l.advance()
			}
		} else if l.match('*') {
			// A block comment goes until the matching "*/";
			// block comments can also nest. The opening "/*"
			// has already been consumed by advance and match.
			nesting := 1
			for nesting > 0 && !l.atEnd() {
				if l.peek() == '/' && l.peekNext() == '*' {
					nesting++
					l.advance()
					l.advance()
				} else if l.peek() == '*' && l.peekNext() == '/' {
					nesting--
					l.advance()
					l.advance()
				} else {
					if l.peek() == '\n' {
						l.line++
					}
					l.advance()
				}
			}
			if nesting > 0 {
				util.Error(l.line, "Unterminated block comment.")
			}
		} else if l.match('.') {
			l.addSimpleToken(SLASH_DOT)
		} else if l.match('_') {
			l.addSimpleToken(SLASH_UNDERSCORE)
		} else if l.match('%') {
			l.addSimpleToken(SLASH_MODULO)
		} else {
			l.addSimpleToken(SLASH)
		}
	case ' ', '\r', '\t':
		// Ignore whitespace.
	case '\n':
		l.line++
	case '"':
		l.str()
	default:
		if isDigit(c) {
			l.number()
		} else if isAlpha(c) {
			l.identifier()
		} else {
			// TODO: if there are multiple bad characters,
			// coalesce similar errors into one
			util.Error(l.line,
fmt.Sprintf("Unexpected character, %v.", c)) } } } func (l *Lexer) identifier() { for isAlphaNumeric(l.peek()) { l.advance() } text := l.source[l.startByte:l.currentByte] ttype, ok := keywords[text] if !ok { ttype = IDENTIFIER } l.addSimpleToken(ttype) } func (l *Lexer) number() { isInt := true for isDigit(l.peek()) { l.advance() } if l.peek() == '.' && isDigit(l.peekNext()) { l.advance() isInt = false for isDigit(l.peek()) { l.advance() } } // Only allow integer and float literals, other values can be // made by combining these if isInt { var bigint big.Int bigint.SetString(l.source[l.startByte:l.currentByte], 10) l.addToken(INTEGER, bigint) } else { float, _ := strconv.ParseFloat(l.source[l.startByte:l.currentByte], 64) l.addToken(FLOAT, float) } } func (l *Lexer) match(expected rune) bool { if l.atEnd() { return false } ch, width := utf8.DecodeRuneInString(l.source[l.currentByte:]) if ch == expected { l.currentRune++ l.currentByte += width return true } else { return false } } func (l Lexer) peek() rune { if l.atEnd() { return '\x00' } ch, _ := utf8.DecodeRuneInString(l.source[l.currentByte:]) return ch } func (l Lexer) peekNext() rune { _, width1 := utf8.DecodeRuneInString(l.source[l.currentByte:]) // "width1 == 0" signifies we reached the end of the string with the first character if width1 == 0 || l.currentByte+width1 >= len(l.source) { return '\x00' } ch2, _ := utf8.DecodeRuneInString(l.source[l.currentByte+width1:]) return ch2 } func isAlpha(c rune) bool { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' } func isAlphaNumeric(c rune) bool { return isAlpha(c) || isDigit(c) } func isDigit(c rune) bool { return c >= '0' && c <= '9' } func (l *Lexer) advance() rune { ch, width := utf8.DecodeRuneInString(l.source[l.currentByte:]) l.currentRune++ l.currentByte += width return ch } func (l *Lexer) str() { var sb strings.Builder for l.peek() != '"' && !l.atEnd() { c := l.peek() if c == '\n' { l.line++ } else if c == '\\' { l.advance() // TODO: add more escape sequences, including \xNN switch l.peek() { case 'n': c = '\n' case 'r': c = '\r' case 't': c = '\t' case '\\': c = '\\' case '"': c = '"' case '\'': c = '\'' case 'e': c = '\x1b' default: util.Error(l.line, fmt.Sprintf("Invalid escape sequence \\%v.", l.peek())) return } } sb.WriteRune(c) l.advance() } if l.atEnd() { util.Error(l.line, "Unterminated string.") return } // Closing ". l.advance() value := sb.String() l.addToken(STRING, value) } // Simple refers to "having no literal" (TODO: rename function) func (l *Lexer) addSimpleToken(ttype TokenType) { l.addToken(ttype, nil) } func (l *Lexer) addToken(ttype TokenType, literal interface{}) { text := l.source[l.startByte:l.currentByte] l.Tokens = append(l.Tokens, NewToken(ttype, text, literal, l.line)) }