(** Tokens produced by {!lex}. *)
type token =
  (* Keywords; see [keyword_or_ident] for the exact spelling of each. *)
  | TIntKw
  | TBoolKw
  | TVoidKw
  | TStructKw
  | TIf
  | TElse
  | TFor
  | TEach
  | TForEach
  | TIn
  | TReturn
  | TTrue
  | TFalse
  | TFn
  | TLet
  | TMut
  (* Identifiers and decimal integer literals. *)
  | TIdent of string
  | TIntLit of int
  (* Brackets and punctuation. *)
  | TLParen
  | TRParen
  | TLBrace
  | TRBrace
  | TLBracket
  | TRBracket
  | TSemicolon
  | TComma
  | TDot
  (* Assignment, arithmetic, logical and comparison operators. *)
  | TAssign
  | TPlus
  | TMinus
  | TStar
  | TSlash
  | TPercent
  | TAndAnd
  | TOrOr
  | TBang
  | TEqEq
  | TNe
  | TLt
  | TLe
  | TGt
  | TGe
  (* End-of-input marker; always the last token returned by [lex]. *)
  | TEOF
  | TArrow
  | TColon
  | THash

(** Raised by {!lex} when the input cannot be tokenized. The payload is a
    human-readable description of the problem. *)
exception Lex_error of string

(** [is_space c] is [true] for space, tab, carriage return and newline. *)
let is_space = function
  | ' ' | '\t' | '\r' | '\n' -> true
  | _ -> false

(** [is_digit c] is [true] for the ASCII decimal digits. *)
let is_digit c = c >= '0' && c <= '9'

(** [is_ident_start c] is [true] for ASCII letters and underscore. *)
let is_ident_start c =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_'

(** [is_ident_char c] is [true] for any character allowed after the first
    character of an identifier: letters, underscore and digits. *)
let is_ident_char c = is_ident_start c || is_digit c

(** [keyword_or_ident s] maps a reserved word to its keyword token and any
    other string to [TIdent s]. *)
let keyword_or_ident s =
  match s with
  | "int" -> TIntKw
  | "bool" -> TBoolKw
  | "void" -> TVoidKw
  | "struct" -> TStructKw
  | "if" -> TIf
  | "else" -> TElse
  | "for" -> TFor
  | "each" -> TEach
  | "foreach" -> TForEach
  | "in" -> TIn
  | "return" -> TReturn
  | "true" -> TTrue
  | "false" -> TFalse
  | "fn" -> TFn
  | "let" -> TLet
  | "mut" -> TMut
  | _ -> TIdent s

(** [lex src] converts [src] into a list of tokens terminated by [TEOF].

    Whitespace, line comments (two slashes up to the end of the line) and
    block comments (slash-star up to the first star-slash, not nested) are
    skipped.

    @raise Lex_error if a block comment is not terminated, if a character
    does not start any token, or if an integer literal does not fit in a
    native [int]. *)
let lex (src : string) : token list =
  let n = String.length src in
  (* True when position [i] has a successor and that successor is [c]. *)
  let followed_by i c = i + 1 < n && src.[i + 1] = c in
  (* Returns the index just past the newline ending the comment, or [n] when
     the comment runs to the end of the input. *)
  let rec skip_line_comment i =
    if i >= n then i
    else if src.[i] = '\n' then i + 1
    else skip_line_comment (i + 1)
  in
  (* Returns the index just past the closing star-slash. *)
  let rec skip_block_comment i =
    if i + 1 >= n then raise (Lex_error "unterminated block comment")
    else if src.[i] = '*' && src.[i + 1] = '/' then i + 2
    else skip_block_comment (i + 1)
  in
  (* Scans the digit run starting at [i]; [j] is the scan position. Returns
     the literal token and the index of the first non-digit. *)
  let rec read_number i j =
    if j < n && is_digit src.[j] then read_number i (j + 1)
    else
      let s = String.sub src i (j - i) in
      (* [int_of_string] raises [Failure] when the digits overflow [int];
         report that through the lexer's own exception instead. *)
      let value =
        try int_of_string s
        with Failure _ ->
          raise
            (Lex_error (Printf.sprintf "integer literal out of range: %s" s))
      in
      (TIntLit value, j)
  in
  (* Scans the identifier starting at [i]; [j] is the scan position. Returns
     the keyword or identifier token and the index just past it. *)
  let rec read_ident i j =
    if j < n && is_ident_char src.[j] then read_ident i (j + 1)
    else
      let s = String.sub src i (j - i) in
      (keyword_or_ident s, j)
  in
  (* Main loop: [acc] holds the tokens seen so far in reverse order. *)
  let rec loop i acc =
    if i >= n then List.rev (TEOF :: acc)
    else if is_space src.[i] then loop (i + 1) acc
    else
      match src.[i] with
      (* Comment openers must be tried before the plain slash operator. *)
      | '/' when followed_by i '/' -> loop (skip_line_comment (i + 2)) acc
      | '/' when followed_by i '*' -> loop (skip_block_comment (i + 2)) acc
      | '(' -> loop (i + 1) (TLParen :: acc)
      | ')' -> loop (i + 1) (TRParen :: acc)
      | '{' -> loop (i + 1) (TLBrace :: acc)
      | '}' -> loop (i + 1) (TRBrace :: acc)
      | '[' -> loop (i + 1) (TLBracket :: acc)
      | ']' -> loop (i + 1) (TRBracket :: acc)
      | ';' -> loop (i + 1) (TSemicolon :: acc)
      | ',' -> loop (i + 1) (TComma :: acc)
      | '.' -> loop (i + 1) (TDot :: acc)
      | '+' -> loop (i + 1) (TPlus :: acc)
      (* Two-character operators are matched before their one-character
         prefixes. *)
      | '-' when followed_by i '>' -> loop (i + 2) (TArrow :: acc)
      | '-' -> loop (i + 1) (TMinus :: acc)
      | '*' -> loop (i + 1) (TStar :: acc)
      | '%' -> loop (i + 1) (TPercent :: acc)
      | '/' -> loop (i + 1) (TSlash :: acc)
      | '!' when followed_by i '=' -> loop (i + 2) (TNe :: acc)
      | '!' -> loop (i + 1) (TBang :: acc)
      | '=' when followed_by i '=' -> loop (i + 2) (TEqEq :: acc)
      | '=' -> loop (i + 1) (TAssign :: acc)
      (* A lone ampersand or bar is not a token and falls through to the
         error case below. *)
      | '&' when followed_by i '&' -> loop (i + 2) (TAndAnd :: acc)
      | '|' when followed_by i '|' -> loop (i + 2) (TOrOr :: acc)
      | '<' when followed_by i '=' -> loop (i + 2) (TLe :: acc)
      | '<' -> loop (i + 1) (TLt :: acc)
      | '>' when followed_by i '=' -> loop (i + 2) (TGe :: acc)
      | '>' -> loop (i + 1) (TGt :: acc)
      | ':' -> loop (i + 1) (TColon :: acc)
      | '#' -> loop (i + 1) (THash :: acc)
      | c when is_digit c ->
        let tok, j = read_number i (i + 1) in
        loop j (tok :: acc)
      | c when is_ident_start c ->
        let tok, j = read_ident i (i + 1) in
        loop j (tok :: acc)
      | c ->
        let msg = Printf.sprintf "unexpected character: %c" c in
        raise (Lex_error msg)
  in
  loop 0 []