// elysium/lexer.go — lexer for the Elysium language.
package main

import (
	"errors"
	"slices"
	"strconv"
	"strings"
	"unicode"
)
// Whitespace lists the runes skipped between tokens.
var Whitespace []rune = []rune{' ', '\t', '\r', '\n'}

// Separators are single-rune punctuation tokens; each one always lexes on its own.
var Separators []rune = []rune{'(', ')', '{', '}', ';', ','}

// Operators lists the language's operator runes.
// NOTE(review): Operators is not consulted anywhere in this file —
// nextToken does not split on them, so e.g. "a=b" lexes as one token. Confirm intent.
var Operators []rune = []rune{'=', '>', '<', '!', '+', '-', '*', '/', '%'}
// LexType discriminates the kinds of token produced by the lexer and
// selects the dynamic type stored in LexToken.Value.
type LexType uint32

const (
	Type_Identifier LexType = iota // Value is a string
	Type_Keyword                   // Value is a Keyword
	Type_Separator                 // Value is a Separator
	Type_Literal                   // Value is a Literal
)
// Keyword identifies a reserved word; stored in LexToken.Value when
// Type is Type_Keyword.
type Keyword uint32

const (
	Keyword_Import Keyword = iota
	Keyword_Void
)
// Separator identifies a punctuation token; stored in LexToken.Value when
// Type is Type_Separator.
type Separator uint32

const (
	Separator_OpenParen  Separator = iota // '('
	Separator_CloseParen                  // ')'
	Separator_OpenCurly                   // '{'
	Separator_CloseCurly                  // '}'
	Separator_Semicolon                   // ';'
	Separator_Comma                       // ','
)
// LiteralType distinguishes string literals from numeric literals
// inside a Literal.
type LiteralType uint32

const (
	Literal_String LiteralType = iota
	Literal_Number
)
// LexToken is a single lexed token.
type LexToken struct {
	Type LexType
	// Position is presumably the token's offset in the source.
	// NOTE(review): never assigned anywhere in this file — always zero. Confirm.
	Position uint64
	// Value holds the payload; its dynamic type depends on Type
	// (string, Keyword, Separator, or Literal).
	Value any
}
// Literal is the payload of a Type_Literal token.
type Literal struct {
	Type LiteralType
	// Primitive is the concrete numeric type of the literal;
	// parseToken stores InvalidValue here for string literals.
	Primitive PrimitiveType
	// Value holds the parsed value: string for Literal_String, or the
	// int64/uint64/float64 returned by parseNumber for Literal_Number.
	Value any
}
// stringLiteral consumes a double-quoted string literal from the front of
// runes (runes[0] must be '"'). It returns the decoded contents, the runes
// remaining after the closing quote, and an error for an unterminated
// literal or a dangling trailing backslash.
//
// Recognized escape sequences: \n, \r and \t. Any other escaped rune is
// kept verbatim (so \" yields '"', \\ yields '\', and \x yields 'x').
func stringLiteral(runes []rune) (string, []rune, error) {
	var out strings.Builder
	idx := 1 // skip the opening '"'
	for idx < len(runes) && runes[idx] != '"' {
		r := runes[idx]
		if r == '\\' {
			if idx == len(runes)-1 {
				return "", nil, errors.New("unmatched escape sequence")
			}
			idx++
			switch runes[idx] {
			case 'n':
				r = '\n'
			case 'r':
				r = '\r'
			case 't':
				r = '\t'
			default:
				// Unknown escapes pass the escaped rune through unchanged.
				r = runes[idx]
			}
		}
		out.WriteRune(r)
		idx++
	}
	if idx == len(runes) {
		return "", nil, errors.New("unclosed string literal")
	}
	// idx sits on the closing quote; skip it.
	return out.String(), runes[idx+1:], nil
}
// nextToken splits the next raw token off the front of program.
// It returns the token text, the remaining source, and an error from
// string-literal scanning. An empty token means program held only
// whitespace (end of input). String literals are returned re-wrapped in
// quotes so parseToken can recognize them.
func nextToken(program string) (string, string, error) {
	src := []rune(program)

	// Skip leading whitespace.
	pos := 0
	for pos < len(src) && slices.Contains(Whitespace, src[pos]) {
		pos++
	}
	if pos == len(src) {
		return "", "", nil
	}

	// String literal?
	if src[pos] == '"' {
		literal, rest, err := stringLiteral(src[pos:])
		if err != nil {
			return "", "", err
		}
		return "\"" + literal + "\"", string(rest), nil
	}

	// Scan until whitespace or a separator rune ends the token.
	stop := pos
	for stop < len(src) {
		r := src[stop]
		if slices.Contains(Whitespace, r) || slices.Contains(Separators, r) {
			break
		}
		stop++
	}
	if stop == pos {
		// The current rune is itself a separator: emit it as a one-rune token.
		stop++
	}
	return string(src[pos:stop]), string(src[stop:]), nil
}
2024-03-11 22:05:36 +01:00
func parseNumber(raw string, numberType PrimitiveType) (any, error) {
if isSignedInt(numberType) {
return strconv.ParseInt(raw, 10, getBits(numberType))
}
if isUnsignedInt(numberType) {
return strconv.ParseUint(raw, 10, getBits(numberType))
}
if isFloatingPoint(numberType) {
return strconv.ParseFloat(raw, getBits(numberType))
}
panic("Unhandled type (" + strconv.FormatUint(uint64(numberType), 10) + ") in parseNumber()")
}
func parseToken(token string) (*LexToken, error) {
2024-03-10 22:48:57 +01:00
if strings.HasPrefix(token, "\"") {
2024-03-11 22:05:36 +01:00
return &LexToken{Type: Type_Literal, Value: Literal{Type: Literal_String, Primitive: InvalidValue, Value: token[1 : len(token)-1]}}, nil
}
runes := []rune(token)
startsWithMinus := runes[0] == '-'
if startsWithMinus || unicode.IsDigit([]rune(token)[0]) {
// TODO: hexadecimal/binary/octal constants
var numberType PrimitiveType = InvalidValue
var rawNumber string = token
for i, name := range NumberTypeNames {
if strings.HasSuffix(token, name) {
numberType = PrimitiveType(i)
rawNumber = token[:len(token)-len(name)]
}
}
containsDot := slices.Contains(runes, '.')
if numberType == InvalidValue {
if containsDot {
numberType = Primitive_F64
} else if startsWithMinus {
numberType = Primitive_I64
} else {
numberType = Primitive_U64
}
}
if containsDot && !isFloatingPoint(numberType) {
return nil, errors.New("dot in non floating-point constant")
}
number, err := parseNumber(rawNumber, numberType)
if err != nil {
return nil, err
}
return &LexToken{Type: Type_Literal, Value: Literal{Type: Literal_Number, Primitive: numberType, Value: number}}, nil
2024-03-10 22:48:57 +01:00
}
switch token {
case "void":
2024-03-11 22:05:36 +01:00
return &LexToken{Type: Type_Keyword, Value: Keyword_Void}, nil
2024-03-10 22:48:57 +01:00
case "import":
2024-03-11 22:05:36 +01:00
return &LexToken{Type: Type_Keyword, Value: Keyword_Import}, nil
2024-03-10 22:48:57 +01:00
case "(":
2024-03-11 22:05:36 +01:00
return &LexToken{Type: Type_Separator, Value: Separator_OpenParen}, nil
2024-03-10 22:48:57 +01:00
case ")":
2024-03-11 22:05:36 +01:00
return &LexToken{Type: Type_Separator, Value: Separator_CloseParen}, nil
2024-03-10 22:48:57 +01:00
case "{":
2024-03-11 22:05:36 +01:00
return &LexToken{Type: Type_Separator, Value: Separator_OpenCurly}, nil
2024-03-10 22:48:57 +01:00
case "}":
2024-03-11 22:05:36 +01:00
return &LexToken{Type: Type_Separator, Value: Separator_CloseCurly}, nil
2024-03-10 22:48:57 +01:00
case ";":
2024-03-11 22:05:36 +01:00
return &LexToken{Type: Type_Separator, Value: Separator_Semicolon}, nil
case ",":
return &LexToken{Type: Type_Separator, Value: Separator_Comma}, nil
2024-03-10 22:48:57 +01:00
default:
2024-03-11 22:05:36 +01:00
return &LexToken{Type: Type_Identifier, Value: token}, nil
2024-03-10 22:48:57 +01:00
}
}
func lexer(program string) ([]LexToken, error) {
var tokens []LexToken
for len(program) > 0 {
token, rest, err := nextToken(program)
if err != nil {
return nil, err
}
if len(token) == 0 {
break
}
lexToken, err := parseToken(token)
if err != nil {
return nil, err
}
program = rest
2024-03-11 22:05:36 +01:00
tokens = append(tokens, *lexToken)
2024-03-10 22:48:57 +01:00
}
return tokens, nil
}