elysium/lexer.go

package main

import (
	"fmt"
	"slices"
	"strconv"
	"strings"
	"unicode"
)

var Whitespace []rune = []rune{' ', '\t', '\r', '\n'}

type LexType uint32

const (
	Type_Identifier LexType = iota
	Type_Keyword
	Type_Separator
	Type_Literal
	Type_Operator
)

type Keyword uint32

var Keywords []string = []string{"import", "module", "void", "return", "true", "false", "if", "else", "raw", "while"}

// The Keyword constants must stay in the same order as the Keywords slice,
// since keyword tokens are built via Keyword(index) in nextToken.
const (
	Keyword_Import Keyword = iota
	Keyword_Module
	Keyword_Void
	Keyword_Return
	Keyword_True
	Keyword_False
	Keyword_If
	Keyword_Else
	Keyword_Raw
	Keyword_While
)

type Separator uint32

var Separators []string = []string{"(", ")", "{", "}", "[", "]", ";", ",", "."}

// The Separator constants must stay in the same order as the Separators slice,
// since separator tokens are built via Separator(index) in trySeparator.
const (
	Separator_OpenParen Separator = iota
	Separator_CloseParen
	Separator_OpenCurly
	Separator_CloseCurly
	Separator_OpenSquare
	Separator_CloseSquare
	Separator_Semicolon
	Separator_Comma
	Separator_Dot
)

type Operator uint32

var Operators []string = []string{"=", ">", "<", "!", "+", "-", "*", "/", "%", "==", ">=", "<=", "!=", "+=", "-=", "*=", "/=", "%=", "++", "--", "~"}

// The Operator constants must stay in the same order as the Operators slice,
// since operator tokens are built via Operator(index) in tryOperator.
const (
	Operator_Equals Operator = iota
	Operator_Greater
	Operator_Less
	Operator_Not
	Operator_Plus
	Operator_Minus
	Operator_Multiply
	Operator_Divide
	Operator_Modulo
	Operator_EqualsEquals
	Operator_GreaterEquals
	Operator_LessEquals
	Operator_NotEquals
	Operator_PlusEquals
	Operator_MinusEquals
	Operator_MultiplyEquals
	Operator_DivideEquals
	Operator_ModuloEquals
	Operator_PlusPlus
	Operator_MinusMinus
	Operator_BitwiseNot
)

type LiteralType uint32

const (
	Literal_String LiteralType = iota
	Literal_Number
	Literal_Boolean
)

type LexToken struct {
	Type     LexType
	Position TokenPosition
	Value    any
}

type TokenPosition struct {
	SourceFile string
	Position   uint64
}

type Literal struct {
	Type      LiteralType
	Primitive PrimitiveType
	Value     any
}

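// Lexer holds the remaining source runes and position bookkeeping while tokenizing a single file.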
type Lexer struct {
	Runes             []rune // remaining, unconsumed runes of the source
	LastTokenPosition uint64 // rune offset at which the current token started
	SourceFile        string
	Position          uint64 // rune offset of the next rune to be consumed
}

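// unknownPosition returns an empty TokenPosition for use when no source location is available.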
func unknownPosition() TokenPosition {
	return TokenPosition{SourceFile: "", Position: 0}
}

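// error builds a CompilerError pointing at the start of the token currently being lexed.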
func (l *Lexer) error(message string) error {
	return CompilerError{Position: TokenPosition{SourceFile: l.SourceFile, Position: l.LastTokenPosition}, Message: message}
}

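// peekRune returns a pointer to the next rune without consuming it, or nil if the input is exhausted.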
func (l *Lexer) peekRune() *rune {
	if len(l.Runes) == 0 {
		return nil
	}
	return &l.Runes[0]
}

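// tryOperator checks whether the remaining input starts with an operator. It consumes and
// returns the longest match, or returns InvalidValue (consuming nothing) if none matches.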
func (l *Lexer) tryOperator() Operator {
	var foundOp Operator = InvalidValue
	var foundOpLen int = 0
	str := string(l.Runes)
	for i, operator := range Operators {
		operatorLen := len([]rune(operator))
		if operatorLen <= foundOpLen {
			continue
		}
		if strings.HasPrefix(str, operator) {
			foundOp = Operator(i)
			foundOpLen = operatorLen
		}
	}

	for i := 0; i < foundOpLen; i++ {
		l.nextRune()
	}
	return foundOp
}

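// trySeparator checks whether the remaining input starts with a separator. It consumes and
// returns the longest match, or returns InvalidValue (consuming nothing) if none matches.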
func (l *Lexer) trySeparator() Separator {
	var foundSep Separator = InvalidValue
	var foundSepLen int = 0
	str := string(l.Runes)
	for i, separator := range Separators {
		separatorLen := len([]rune(separator))
		if separatorLen <= foundSepLen {
			continue
		}
		if strings.HasPrefix(str, separator) {
			foundSep = Separator(i)
			foundSepLen = separatorLen
		}
	}

	for i := 0; i < foundSepLen; i++ {
		l.nextRune()
	}
	return foundSep
}

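// hasNext reports whether the remaining input starts with any of the given strings.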
func (l *Lexer) hasNext(choices ...string) bool {
	str := string(l.Runes)
	for _, c := range choices {
		if strings.HasPrefix(str, c) {
			return true
		}
	}
	return false
}

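// nextRune consumes the next rune and advances Position, returning nil if the input is exhausted.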
func (l *Lexer) nextRune() *rune {
	if len(l.Runes) == 0 {
		return nil
	}
	r := l.Runes[0]
	l.Runes = l.Runes[1:]
	l.Position++
	return &r
}

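// stringLiteral consumes a double-quoted string literal (the opening quote must be the next rune)
// and returns its contents without the quotes.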
func (l *Lexer) stringLiteral() (string, error) {
	l.LastTokenPosition = l.Position

	openQuote := l.nextRune()
	if openQuote == nil || *openQuote != '"' {
		return "", l.error("expected \"")
	}

	literal := ""
	for {
		r := l.nextRune()
		if r == nil {
			return "", l.error("unexpected end of file")
		}
		if *r == '"' {
			break
		}
		if *r == '\\' {
			// Escaped characters keep only the rune after the backslash; no
			// translation (such as \n -> newline) is applied here.
			escaped := l.nextRune()
			if escaped == nil {
				return "", l.error("unmatched escape sequence")
			}
			literal += string(*escaped)
			continue
		}
		literal += string(*r)
	}

	return literal, nil
}

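// nextToken skips leading whitespace and lexes the next token. It returns (nil, nil) once the input is exhausted.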
func (l *Lexer) nextToken() (*LexToken, error) {
	// Skip whitespace
	for {
		r := l.peekRune()
		if r == nil {
			return nil, nil
		}
		if !slices.Contains(Whitespace, *r) {
			break
		}
		l.nextRune()
	}

	l.LastTokenPosition = l.Position

	r := l.peekRune()
	if r == nil {
		return nil, nil
	}

	if *r == '"' {
		literal, err := l.stringLiteral()
		if err != nil {
			return nil, err
		}
		return &LexToken{Type: Type_Literal, Position: TokenPosition{SourceFile: l.SourceFile, Position: l.LastTokenPosition}, Value: Literal{Type: Literal_String, Primitive: InvalidValue, Value: literal}}, nil
	}

	op := l.tryOperator()
	if op != InvalidValue {
		return &LexToken{Type: Type_Operator, Position: TokenPosition{SourceFile: l.SourceFile, Position: l.LastTokenPosition}, Value: op}, nil
	}

	sep := l.trySeparator()
	if sep != InvalidValue {
		return &LexToken{Type: Type_Separator, Position: TokenPosition{SourceFile: l.SourceFile, Position: l.LastTokenPosition}, Value: sep}, nil
	}

	// Collect runes until whitespace, a separator or an operator starts
	token := ""
	for {
		r := l.peekRune()
		if r == nil || slices.Contains(Whitespace, *r) || l.hasNext(Separators...) || l.hasNext(Operators...) {
			break
		}
		token += string(*l.nextRune())
	}

	if len(token) == 0 {
		if len(l.Runes) == 0 {
			return nil, nil
		}

		token = string(*l.nextRune())
	}

	runes := []rune(token)
	if unicode.IsDigit(runes[0]) {
		// Number literal, optionally suffixed with a primitive type name (see PRIMITIVE_TYPE_NAMES)
		var numberType PrimitiveType = InvalidValue
		var rawNumber string = token
		for i, name := range PRIMITIVE_TYPE_NAMES {
			if strings.HasSuffix(token, name) {
				numberType = PrimitiveType(i)
				rawNumber = token[:len(token)-len(name)]
			}
		}

		containsDot := slices.Contains(runes, '.')
		if numberType == InvalidValue {
			if containsDot {
				numberType = Primitive_F64
			} else {
				numberType = Primitive_I64
			}
		}

		if containsDot && !isFloatingPoint(numberType) {
			return nil, l.error("dot in non floating-point constant")
		}

		number, err := l.parseNumber(rawNumber, numberType)
		if err != nil {
			return nil, err
		}
		return &LexToken{Type: Type_Literal, Position: TokenPosition{SourceFile: l.SourceFile, Position: l.LastTokenPosition}, Value: Literal{Type: Literal_Number, Primitive: numberType, Value: number}}, nil
	}

	if idx := slices.Index(Keywords, token); idx != -1 {
		return &LexToken{Type: Type_Keyword, Position: TokenPosition{SourceFile: l.SourceFile, Position: l.LastTokenPosition}, Value: Keyword(idx)}, nil
	}

	return &LexToken{Type: Type_Identifier, Position: TokenPosition{SourceFile: l.SourceFile, Position: l.LastTokenPosition}, Value: token}, nil
}

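// parseNumber parses a raw numeric literal, with an optional 0x, 0o or 0b base prefix, into a value of the given primitive type.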
func (l *Lexer) parseNumber(raw string, numberType PrimitiveType) (any, error) {
	if numberType == Primitive_Bool {
		return nil, l.error("bool not allowed as number suffix")
	}

	// Determine the base from an optional 0x/0o/0b prefix
	base := 10
	if strings.HasPrefix(raw, "0x") {
		raw = raw[2:]
		base = 16
	} else if strings.HasPrefix(raw, "0o") {
		raw = raw[2:]
		base = 8
	} else if strings.HasPrefix(raw, "0b") {
		raw = raw[2:]
		base = 2
	}

	if isSignedInt(numberType) {
		num, err := strconv.ParseInt(raw, base, getBits(numberType))
		if err != nil {
			return nil, l.error("failed to parse literal: " + err.Error())
		}
		return num, nil
	}

	if isUnsignedInt(numberType) {
		num, err := strconv.ParseUint(raw, base, getBits(numberType))
		if err != nil {
			return nil, l.error("failed to parse literal: " + err.Error())
		}
		return num, nil
	}

	if isFloatingPoint(numberType) {
		if base != 10 {
			return nil, l.error("non base 10 float literals are not supported")
		}
		num, err := strconv.ParseFloat(raw, getBits(numberType))
		if err != nil {
			return nil, l.error("failed to parse literal: " + err.Error())
		}
		return num, nil
	}

	panic(fmt.Sprintf("Unhandled type %s in parseNumber()", numberType))
}

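// lexer tokenizes the given source text and returns its tokens in order of appearance.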
func lexer(sourceFile string, source string) ([]LexToken, error) {
	var tokens []LexToken
	lexer := Lexer{SourceFile: sourceFile, Runes: []rune(source)}

	for {
		token, err := lexer.nextToken()
		if err != nil {
			return nil, err
		}

		if token == nil {
			break
		}

		tokens = append(tokens, *token)
	}

	return tokens, nil
}