// elysium/lexer.go
//
// 275 lines
// 5.5 KiB
// Go

package main
import (
"errors"
"slices"
"strconv"
"strings"
"unicode"
)
// Rune classes consulted while splitting source text into tokens.
var (
	// Whitespace lists the runes that are skipped between tokens and that
	// end the token currently being read.
	Whitespace = []rune{' ', '\t', '\r', '\n'}

	// Separators lists the punctuation runes that end the token currently
	// being read and are each emitted as a one-rune token of their own.
	Separators = []rune{'(', ')', '{', '}', ';', ','}

	// Operators lists the operator runes. Nothing in the lexer code visible
	// in this file consults it yet.
	Operators = []rune{'=', '>', '<', '!', '+', '-', '*', '/', '%'}
)
// LexType is the broad category a LexToken belongs to.
type LexType uint32

// Token categories, numbered from zero in declaration order.
const (
	// Type_Identifier is the category of every token that is not recognised
	// as a keyword, separator or literal.
	Type_Identifier LexType = iota
	// Type_Keyword is the category of reserved words.
	Type_Keyword
	// Type_Separator is the category of punctuation tokens.
	Type_Separator
	// Type_Literal is the category of string and number constants.
	Type_Literal
)
// Keyword enumerates the reserved words the lexer recognises.
type Keyword uint32

// Reserved words, numbered from zero in declaration order.
const (
	// Keyword_Import is the "import" keyword.
	Keyword_Import Keyword = iota
	// Keyword_Void is the "void" keyword.
	Keyword_Void
)
// Separator enumerates the punctuation tokens the lexer recognises.
type Separator uint32

// Punctuation tokens, numbered from zero in declaration order.
const (
	// Separator_OpenParen is "(".
	Separator_OpenParen Separator = iota
	// Separator_CloseParen is ")".
	Separator_CloseParen
	// Separator_OpenCurly is "{".
	Separator_OpenCurly
	// Separator_CloseCurly is "}".
	Separator_CloseCurly
	// Separator_Semicolon is ";".
	Separator_Semicolon
	// Separator_Comma is ",".
	Separator_Comma
)
// LiteralType distinguishes the kinds of constants a Literal can hold.
type LiteralType uint32

// Literal kinds, numbered from zero in declaration order.
const (
	// Literal_String is a double-quoted string constant.
	Literal_String LiteralType = iota
	// Literal_Number is a numeric constant.
	Literal_Number
)
// LexToken is a single classified token.
type LexToken struct {
	// Type is the token's category.
	Type LexType

	// Position is the lexer's rune offset at the moment the token was
	// classified. The lexer in this file classifies a token only after it
	// has consumed it, so this is the offset just past the token.
	Position uint64

	// Value is the token's payload. The lexer in this file stores a Keyword
	// for keywords, a Separator for separators, a Literal for literals and
	// the token text (string) for identifiers.
	Value any
}
// Literal is the payload of a literal token.
type Literal struct {
	// Type says whether the constant is a string or a number.
	Type LiteralType

	// Primitive is the numeric primitive type of a number literal. The lexer
	// in this file sets it to InvalidValue for string literals.
	Primitive PrimitiveType

	// Value is the constant itself: the unquoted text for a string literal,
	// or whatever parseNumber returned for a number literal.
	Value any
}
// Lexer holds the scanning state for one piece of source text.
type Lexer struct {
	// Runes is the input that has not been consumed yet; consuming a rune
	// drops it from the front.
	Runes []rune

	// Position counts the runes consumed so far.
	Position uint64
}
// error returns a new error whose text is message followed by the lexer's
// current position, formatted as " (at N)".
func (l *Lexer) error(message string) error {
	offset := strconv.FormatUint(l.Position, 10)
	return errors.New(message + " (at " + offset + ")")
}
// peekRune returns a pointer to the next unconsumed rune without consuming
// it, or nil when the input is exhausted. The pointer refers to the element
// inside l.Runes itself, not to a copy.
func (l *Lexer) peekRune() *rune {
	if len(l.Runes) > 0 {
		return &l.Runes[0]
	}
	return nil
}
// nextRune consumes the next rune: it drops it from the front of l.Runes,
// advances l.Position by one and returns a pointer to a copy of the rune.
// It returns nil, leaving the lexer untouched, when the input is exhausted.
func (l *Lexer) nextRune() *rune {
	if len(l.Runes) == 0 {
		return nil
	}
	head, rest := l.Runes[0], l.Runes[1:]
	l.Runes = rest
	l.Position++
	return &head
}
// stringLiteral consumes a double-quoted string literal that starts at the
// current position and returns its contents without the surrounding quotes.
//
// A backslash escapes the rune that follows it. \n, \t, \r and \0 produce a
// newline, a tab, a carriage return and a NUL rune; any other escaped rune,
// including \" and \\, stands for itself.
//
// It returns an error when the next rune is not an opening quote, when the
// input ends before the closing quote, or when the input ends right after a
// backslash.
func (l *Lexer) stringLiteral() (string, error) {
	if open := l.nextRune(); open == nil || *open != '"' {
		return "", l.error("expected \"")
	}

	var literal strings.Builder
	for {
		r := l.nextRune()
		if r == nil {
			return "", l.error("unexpected end of file")
		}

		switch *r {
		case '"':
			return literal.String(), nil
		case '\\':
			escaped := l.nextRune()
			if escaped == nil {
				return "", l.error("unmatched escape sequence")
			}
			switch *escaped {
			case 'n':
				literal.WriteRune('\n')
			case 't':
				literal.WriteRune('\t')
			case 'r':
				literal.WriteRune('\r')
			case '0':
				literal.WriteRune(0)
			default:
				// Unknown escapes keep the escaped rune as-is, which is
				// what makes \" and \\ work.
				literal.WriteRune(*escaped)
			}
		default:
			literal.WriteRune(*r)
		}
	}
}
// nextToken skips leading whitespace and returns the next raw token.
//
// The result is one of:
//   - "" with a nil error when nothing but whitespace remains;
//   - a string literal: the contents returned by stringLiteral, wrapped in
//     double quotes again so callers can tell it apart from a word;
//   - a single separator rune;
//   - a word: the run of runes up to the next whitespace, separator,
//     double quote or end of input.
//
// Errors from stringLiteral are returned unchanged.
//
// TODO: maybe this method should directly return LexToken
func (l *Lexer) nextToken() (string, error) {
	// Skip whitespace; running out of input here means there is no token.
	var first rune
	for {
		r := l.peekRune()
		if r == nil {
			return "", nil
		}
		if !slices.Contains(Whitespace, *r) {
			first = *r
			break
		}
		l.nextRune()
	}

	if first == '"' {
		literal, err := l.stringLiteral()
		if err != nil {
			return "", err
		}
		return "\"" + literal + "\"", nil
	}

	// A separator is always a token of its own.
	if slices.Contains(Separators, first) {
		l.nextRune()
		return string(first), nil
	}

	// Read a word. A double quote ends it too, so that a string literal
	// written directly after a word is lexed as its own token instead of
	// being glued onto the word.
	var token strings.Builder
	for {
		r := l.peekRune()
		if r == nil || *r == '"' || slices.Contains(Whitespace, *r) || slices.Contains(Separators, *r) {
			break
		}
		token.WriteRune(*r)
		l.nextRune()
	}
	return token.String(), nil
}
// parseNumber converts the text raw into a value of the numeric primitive
// numberType.
//
// Signed integers go through strconv.ParseInt and unsigned integers through
// strconv.ParseUint, both in base 10; floating-point types go through
// strconv.ParseFloat. The bit size passed to strconv comes from
// getBits(numberType). The strconv results (int64, uint64 or float64, plus
// the error) are returned unchanged.
//
// It panics when numberType is not classified as a signed integer, an
// unsigned integer or a floating-point type.
func parseNumber(raw string, numberType PrimitiveType) (any, error) {
	switch {
	case isSignedInt(numberType):
		return strconv.ParseInt(raw, 10, getBits(numberType))
	case isUnsignedInt(numberType):
		return strconv.ParseUint(raw, 10, getBits(numberType))
	case isFloatingPoint(numberType):
		return strconv.ParseFloat(raw, getBits(numberType))
	}

	typeID := strconv.FormatUint(uint64(numberType), 10)
	panic("Unhandled type (" + typeID + ") in parseNumber()")
}
// parseToken classifies a raw token and wraps it in a LexToken stamped with
// the lexer's current position.
//
// Classification, in order:
//   - A token starting with a double quote is a string literal. Its first
//     and last bytes are dropped to recover the contents, and its primitive
//     type is InvalidValue.
//   - A token starting with a digit, or with '-' directly followed by a
//     digit or a '.', is a number literal. A suffix listed in
//     NumberTypeNames selects the primitive type and is stripped before
//     parsing; without a suffix the type is Primitive_F64 when the token
//     contains a '.', Primitive_I64 when it starts with '-', and
//     Primitive_U64 otherwise. The value is whatever parseNumber returns.
//   - "void" and "import" are keywords; "(", ")", "{", "}", ";" and "," are
//     separators.
//   - Anything else is an identifier whose value is the token text. This
//     includes a lone "-" and other tokens that start with '-' but are not
//     numbers, which are treated like every other operator-looking token.
//
// It returns an error for an empty token, for a string token too short to
// hold both quotes, for a '.' in a number whose type is not floating point,
// and for any error reported by parseNumber.
func (l *Lexer) parseToken(token string) (*LexToken, error) {
	// Every token carries the lexer's position at the time it is classified.
	newToken := func(kind LexType, value any) (*LexToken, error) {
		return &LexToken{Type: kind, Position: l.Position, Value: value}, nil
	}

	if token == "" {
		return nil, l.error("empty token")
	}

	if strings.HasPrefix(token, "\"") {
		if len(token) < 2 {
			return nil, l.error("malformed string literal")
		}
		return newToken(Type_Literal, Literal{Type: Literal_String, Primitive: InvalidValue, Value: token[1 : len(token)-1]})
	}

	runes := []rune(token)
	startsWithMinus := runes[0] == '-'
	isNumber := unicode.IsDigit(runes[0])
	if startsWithMinus && len(runes) > 1 {
		// Only a minus that is actually followed by the start of a number
		// makes the token numeric; a bare "-" must not be sent to strconv.
		isNumber = unicode.IsDigit(runes[1]) || runes[1] == '.'
	}

	if isNumber {
		// TODO: hexadecimal/binary/octal constants
		var numberType PrimitiveType = InvalidValue
		rawNumber := token
		for i, name := range NumberTypeNames {
			if strings.HasSuffix(token, name) {
				numberType = PrimitiveType(i)
				rawNumber = token[:len(token)-len(name)]
			}
		}

		containsDot := slices.Contains(runes, '.')
		if numberType == InvalidValue {
			switch {
			case containsDot:
				numberType = Primitive_F64
			case startsWithMinus:
				numberType = Primitive_I64
			default:
				numberType = Primitive_U64
			}
		}
		if containsDot && !isFloatingPoint(numberType) {
			return nil, l.error("dot in non floating-point constant")
		}

		number, err := parseNumber(rawNumber, numberType)
		if err != nil {
			return nil, err
		}
		return newToken(Type_Literal, Literal{Type: Literal_Number, Primitive: numberType, Value: number})
	}

	switch token {
	case "void":
		return newToken(Type_Keyword, Keyword_Void)
	case "import":
		return newToken(Type_Keyword, Keyword_Import)
	case "(":
		return newToken(Type_Separator, Separator_OpenParen)
	case ")":
		return newToken(Type_Separator, Separator_CloseParen)
	case "{":
		return newToken(Type_Separator, Separator_OpenCurly)
	case "}":
		return newToken(Type_Separator, Separator_CloseCurly)
	case ";":
		return newToken(Type_Separator, Separator_Semicolon)
	case ",":
		return newToken(Type_Separator, Separator_Comma)
	}
	return newToken(Type_Identifier, token)
}
// lexer splits program into tokens and returns them in source order.
//
// It pulls raw tokens from a fresh Lexer until nextToken yields an empty
// string, classifying each one with parseToken. The first error from either
// step aborts the run and is returned together with a nil slice. A program
// that contains no tokens yields a nil slice and a nil error.
func lexer(program string) ([]LexToken, error) {
	state := Lexer{Runes: []rune(program)}

	var tokens []LexToken
	for {
		raw, err := state.nextToken()
		if err != nil {
			return nil, err
		}
		if raw == "" {
			// An empty raw token signals the end of the input.
			return tokens, nil
		}

		parsed, err := state.parseToken(raw)
		if err != nil {
			return nil, err
		}
		tokens = append(tokens, *parsed)
	}
}