spooky/lib/lexer.ml

136 lines
3.7 KiB
OCaml
Raw Normal View History

2026-04-29 15:50:03 +00:00
(** Tokens produced by [lex].  Keyword constructors correspond to the
    reserved words recognised by [keyword_or_ident]; the remaining
    constructors correspond to literals, punctuation and operators. *)
type token =
(* Type keywords: "int", "bool", "void", "struct". *)
| TIntKw
| TBoolKw
| TVoidKw
| TStructKw
(* Control-flow keywords: "if", "else", "for", "each", "foreach", "in",
   "return". *)
| TIf
| TElse
| TFor
| TEach
| TForEach
| TIn
| TReturn
(* Boolean literals: "true", "false". *)
| TTrue
| TFalse
(* Any other identifier-shaped word; carries the identifier text. *)
| TIdent of string
(* Decimal integer literal; carries the parsed value. *)
| TIntLit of int
(* Delimiters: ( ) { } [ ] ; , . *)
| TLParen
| TRParen
| TLBrace
| TRBrace
| TLBracket
| TRBracket
| TSemicolon
| TComma
| TDot
(* Assignment: = *)
| TAssign
(* Arithmetic operators: + - * / % *)
| TPlus
| TMinus
| TStar
| TSlash
| TPercent
(* Logical operators: && || ! *)
| TAndAnd
| TOrOr
| TBang
(* Comparison operators: == != < <= > >= *)
| TEqEq
| TNe
| TLt
| TLe
| TGt
| TGe
(* End-of-input marker; [lex] appends it as the last token. *)
| TEOF
(** Raised by [lex] when the source text cannot be tokenized.  The payload
    is a human-readable description of the problem. *)
exception Lex_error of string
(** [is_space c] is [true] exactly when [c] is a space, a tab, a carriage
    return or a newline. *)
let is_space c = c = ' ' || c = '\t' || c = '\r' || c = '\n'
(** [is_digit c] is [true] exactly when [c] is an ASCII decimal digit. *)
let is_digit = function
  | '0' .. '9' -> true
  | _ -> false
(** [is_ident_start c] is [true] exactly when [c] may begin an identifier:
    an ASCII letter of either case, or an underscore. *)
let is_ident_start = function
  | 'a' .. 'z' | 'A' .. 'Z' | '_' -> true
  | _ -> false
(** [is_ident_char c] is [true] exactly when [c] may appear inside an
    identifier after its first character: a decimal digit, or any
    character accepted by [is_ident_start]. *)
let is_ident_char c = is_digit c || is_ident_start c
(** [keyword_or_ident word] classifies an identifier-shaped lexeme:
    a reserved word maps to its dedicated keyword token, and any other
    text becomes [TIdent word].  Matching is exact and case-sensitive. *)
let keyword_or_ident word =
  let reserved =
    [
      ("int", TIntKw);
      ("bool", TBoolKw);
      ("void", TVoidKw);
      ("struct", TStructKw);
      ("if", TIf);
      ("else", TElse);
      ("for", TFor);
      ("each", TEach);
      ("foreach", TForEach);
      ("in", TIn);
      ("return", TReturn);
      ("true", TTrue);
      ("false", TFalse);
    ]
  in
  match List.assoc_opt word reserved with
  | Some keyword -> keyword
  | None -> TIdent word
(** [lex src] converts the program text [src] into a list of tokens, in
    source order, always terminated by [TEOF].

    Whitespace (as decided by [is_space]), [// ...] line comments and
    [/* ... */] block comments are skipped.  A block comment ends at the
    first [*/] that follows its opening, so block comments do not nest.

    @raise Lex_error on an unterminated block comment, on an integer
    literal that does not fit in [int], and on any character that cannot
    start a token. *)
let lex (src : string) : token list =
  let n = String.length src in
  (* Returns the index just past the newline ending a line comment, or the
     end of input if the comment runs to the end of [src]. *)
  let rec skip_line_comment i =
    if i >= n then i
    else if src.[i] = '\n' then i + 1
    else skip_line_comment (i + 1)
  in
  (* Returns the index just past the closing "*/".  Two characters are
     needed for the terminator, hence the [i + 1 >= n] bound. *)
  let rec skip_block_comment i =
    if i + 1 >= n then raise (Lex_error "unterminated block comment")
    else if src.[i] = '*' && src.[i + 1] = '/' then i + 2
    else skip_block_comment (i + 1)
  in
  (* Scans the digit run starting at [i] (with [j] the current scan
     position) and returns the literal token plus the index after it. *)
  let rec read_number i j =
    if j < n && is_digit src.[j] then read_number i (j + 1)
    else
      let s = String.sub src i (j - i) in
      (* [int_of_string] would raise [Failure] on a literal too large for
         [int]; report it through the lexer's own exception instead. *)
      match int_of_string_opt s with
      | Some value -> (TIntLit value, j)
      | None ->
        raise
          (Lex_error (Printf.sprintf "integer literal out of range: %s" s))
  in
  (* Scans the identifier starting at [i] (with [j] the current scan
     position) and returns its keyword/identifier token plus the index
     after it. *)
  let rec read_ident i j =
    if j < n && is_ident_char src.[j] then read_ident i (j + 1)
    else
      let s = String.sub src i (j - i) in
      (keyword_or_ident s, j)
  in
  (* Main loop: [i] is the next unread index, [acc] holds the tokens seen
     so far in reverse order. *)
  let rec loop i acc =
    if i >= n then List.rev (TEOF :: acc)
    else if is_space src.[i] then loop (i + 1) acc
    else
      match src.[i] with
      (* Comments must be tried before the plain '/' operator. *)
      | '/' when i + 1 < n && src.[i + 1] = '/' ->
        loop (skip_line_comment (i + 2)) acc
      | '/' when i + 1 < n && src.[i + 1] = '*' ->
        loop (skip_block_comment (i + 2)) acc
      | '(' -> loop (i + 1) (TLParen :: acc)
      | ')' -> loop (i + 1) (TRParen :: acc)
      | '{' -> loop (i + 1) (TLBrace :: acc)
      | '}' -> loop (i + 1) (TRBrace :: acc)
      | '[' -> loop (i + 1) (TLBracket :: acc)
      | ']' -> loop (i + 1) (TRBracket :: acc)
      | ';' -> loop (i + 1) (TSemicolon :: acc)
      | ',' -> loop (i + 1) (TComma :: acc)
      | '.' -> loop (i + 1) (TDot :: acc)
      | '+' -> loop (i + 1) (TPlus :: acc)
      | '-' -> loop (i + 1) (TMinus :: acc)
      | '*' -> loop (i + 1) (TStar :: acc)
      | '%' -> loop (i + 1) (TPercent :: acc)
      | '/' -> loop (i + 1) (TSlash :: acc)
      (* Two-character operators are tried before their one-character
         prefixes. *)
      | '!' when i + 1 < n && src.[i + 1] = '=' -> loop (i + 2) (TNe :: acc)
      | '!' -> loop (i + 1) (TBang :: acc)
      | '=' when i + 1 < n && src.[i + 1] = '=' ->
        loop (i + 2) (TEqEq :: acc)
      | '=' -> loop (i + 1) (TAssign :: acc)
      (* A lone '&' or '|' has no token and falls through to the error
         case below. *)
      | '&' when i + 1 < n && src.[i + 1] = '&' ->
        loop (i + 2) (TAndAnd :: acc)
      | '|' when i + 1 < n && src.[i + 1] = '|' ->
        loop (i + 2) (TOrOr :: acc)
      | '<' when i + 1 < n && src.[i + 1] = '=' -> loop (i + 2) (TLe :: acc)
      | '<' -> loop (i + 1) (TLt :: acc)
      | '>' when i + 1 < n && src.[i + 1] = '=' -> loop (i + 2) (TGe :: acc)
      | '>' -> loop (i + 1) (TGt :: acc)
      | c when is_digit c ->
        let tok, j = read_number i (i + 1) in
        loop j (tok :: acc)
      | c when is_ident_start c ->
        let tok, j = read_ident i (i + 1) in
        loop j (tok :: acc)
      | c ->
        let msg = Printf.sprintf "unexpected character: %c" c in
        raise (Lex_error msg)
  in
  loop 0 []