136 lines
3.7 KiB
OCaml
136 lines
3.7 KiB
OCaml
(** Lexical tokens of the source language.

    Keyword constructors correspond to the reserved words recognised by
    [keyword_or_ident]; the remaining constructors correspond to the literals,
    punctuation and operators recognised by [lex]. *)
type token =
  (* Reserved words *)
  | TIntKw  (** keyword [int] *)
  | TBoolKw  (** keyword [bool] *)
  | TVoidKw  (** keyword [void] *)
  | TStructKw  (** keyword [struct] *)
  | TIf  (** keyword [if] *)
  | TElse  (** keyword [else] *)
  | TFor  (** keyword [for] *)
  | TEach  (** keyword [each] *)
  | TForEach  (** keyword [foreach] *)
  | TIn  (** keyword [in] *)
  | TReturn  (** keyword [return] *)
  | TTrue  (** keyword [true] *)
  | TFalse  (** keyword [false] *)
  (* Identifiers and literals *)
  | TIdent of string  (** any identifier that is not a reserved word *)
  | TIntLit of int  (** decimal integer literal *)
  (* Delimiters *)
  | TLParen  (** left parenthesis *)
  | TRParen  (** right parenthesis *)
  | TLBrace  (** left curly brace *)
  | TRBrace  (** right curly brace *)
  | TLBracket  (** left square bracket *)
  | TRBracket  (** right square bracket *)
  | TSemicolon  (** semicolon *)
  | TComma  (** comma *)
  | TDot  (** dot *)
  (* Operators *)
  | TAssign  (** single [=] *)
  | TPlus  (** [+] *)
  | TMinus  (** [-] *)
  | TStar  (** asterisk *)
  | TSlash  (** [/] *)
  | TPercent  (** [%] *)
  | TAndAnd  (** [&&] *)
  | TOrOr  (** [||] *)
  | TBang  (** [!] *)
  | TEqEq  (** [==] *)
  | TNe  (** [!=] *)
  | TLt  (** [<] *)
  | TLe  (** [<=] *)
  | TGt  (** [>] *)
  | TGe  (** [>=] *)
  (* End of input *)
  | TEOF  (** appended by [lex] after the last real token *)
|
|
|
|
(** Raised by [lex] when the input cannot be tokenized, e.g. on an
    unterminated block comment or an unexpected character. The payload is a
    human-readable description of the problem. *)
exception Lex_error of string
|
|
|
|
(** [is_space ch] is [true] exactly when [ch] is a space, a horizontal tab,
    a carriage return or a line feed. *)
let is_space ch = ch = ' ' || ch = '\t' || ch = '\r' || ch = '\n'
|
|
(** [is_digit ch] is [true] exactly when [ch] is an ASCII decimal digit. *)
let is_digit = function
  | '0' .. '9' -> true
  | _ -> false
|
|
|
|
(** [is_ident_start c] is [true] exactly when [c] may begin an identifier:
    an ASCII letter of either case, or an underscore. *)
let is_ident_start = function
  | 'a' .. 'z' | 'A' .. 'Z' | '_' -> true
  | _ -> false
|
|
|
|
(** [is_ident_char ch] is [true] exactly when [ch] may appear inside an
    identifier: anything accepted by [is_ident_start], or a decimal digit. *)
let is_ident_char ch = is_digit ch || is_ident_start ch
|
|
|
|
(** [keyword_or_ident s] classifies an identifier-shaped lexeme.

    Returns the matching keyword token when [s] is one of the reserved words
    listed below, and [TIdent s] otherwise. The comparison is case-sensitive
    and on the whole string. *)
let keyword_or_ident s =
  match s with
  (* Type keywords *)
  | "int" -> TIntKw
  | "bool" -> TBoolKw
  | "void" -> TVoidKw
  | "struct" -> TStructKw
  (* Control-flow keywords *)
  | "if" -> TIf
  | "else" -> TElse
  | "for" -> TFor
  | "each" -> TEach
  | "foreach" -> TForEach
  | "in" -> TIn
  | "return" -> TReturn
  (* Boolean literals *)
  | "true" -> TTrue
  | "false" -> TFalse
  (* Anything else is an ordinary identifier. *)
  | _ -> TIdent s
|
|
|
|
(** [lex src] converts the program text [src] into its list of tokens, in
    source order, always terminated by [TEOF].

    Whitespace (as defined by [is_space]), line comments introduced by [//]
    and block comments delimited by [/*] and [*/] are skipped. Block comments
    do not nest. A digit run immediately followed by identifier characters is
    lexed as an integer literal followed by an identifier.

    @raise Lex_error on an unterminated block comment, on an integer literal
    that does not fit in [int], and on any character that cannot start a
    token (this includes a lone [&] or a lone [|]). *)
let lex (src : string) : token list =
  let n = String.length src in
  (* [followed_by i c]: the character right after index [i] exists and is [c]. *)
  let followed_by i c = i + 1 < n && src.[i + 1] = c in
  (* Returns the index just past the newline that ends a line comment, or [n]
     when the comment runs to the end of the input. *)
  let rec skip_line_comment i =
    if i >= n then i
    else if src.[i] = '\n' then i + 1
    else skip_line_comment (i + 1)
  in
  (* Returns the index just past the closing delimiter of a block comment.
     Two characters are needed for the delimiter, hence the [i + 1] bound. *)
  let rec skip_block_comment i =
    if i + 1 >= n then raise (Lex_error "unterminated block comment")
    else if src.[i] = '*' && src.[i + 1] = '/' then i + 2
    else skip_block_comment (i + 1)
  in
  (* Scans the digit run [src.[i .. j-1]] and returns the literal token
     together with the index of the first character after it. *)
  let rec read_number i j =
    if j < n && is_digit src.[j] then read_number i (j + 1)
    else
      let s = String.sub src i (j - i) in
      (* [int_of_string] would raise [Failure] on overflow; report it as a
         lexing error instead so callers only have to handle [Lex_error]. *)
      (match int_of_string_opt s with
       | Some v -> (TIntLit v, j)
       | None ->
         raise
           (Lex_error (Printf.sprintf "integer literal out of range: %s" s)))
  in
  (* Scans the identifier [src.[i .. j-1]] and returns the keyword or
     identifier token together with the index of the first character after
     it. *)
  let rec read_ident i j =
    if j < n && is_ident_char src.[j] then read_ident i (j + 1)
    else
      let s = String.sub src i (j - i) in
      (keyword_or_ident s, j)
  in
  (* Main scanner. [acc] holds the tokens seen so far in reverse order; every
     recursive call is in tail position. *)
  let rec loop i acc =
    if i >= n then List.rev (TEOF :: acc)
    else if is_space src.[i] then loop (i + 1) acc
    else
      match src.[i] with
      (* Comments must be tried before the plain [/] operator. *)
      | '/' when followed_by i '/' -> loop (skip_line_comment (i + 2)) acc
      | '/' when followed_by i '*' -> loop (skip_block_comment (i + 2)) acc
      (* Single-character delimiters *)
      | '(' -> loop (i + 1) (TLParen :: acc)
      | ')' -> loop (i + 1) (TRParen :: acc)
      | '{' -> loop (i + 1) (TLBrace :: acc)
      | '}' -> loop (i + 1) (TRBrace :: acc)
      | '[' -> loop (i + 1) (TLBracket :: acc)
      | ']' -> loop (i + 1) (TRBracket :: acc)
      | ';' -> loop (i + 1) (TSemicolon :: acc)
      | ',' -> loop (i + 1) (TComma :: acc)
      | '.' -> loop (i + 1) (TDot :: acc)
      (* Single-character arithmetic operators *)
      | '+' -> loop (i + 1) (TPlus :: acc)
      | '-' -> loop (i + 1) (TMinus :: acc)
      | '*' -> loop (i + 1) (TStar :: acc)
      | '%' -> loop (i + 1) (TPercent :: acc)
      | '/' -> loop (i + 1) (TSlash :: acc)
      (* One- or two-character operators: the longer form is tried first. *)
      | '!' when followed_by i '=' -> loop (i + 2) (TNe :: acc)
      | '!' -> loop (i + 1) (TBang :: acc)
      | '=' when followed_by i '=' -> loop (i + 2) (TEqEq :: acc)
      | '=' -> loop (i + 1) (TAssign :: acc)
      (* There is no single-character form of these two; a lone occurrence
         falls through to the error case below. *)
      | '&' when followed_by i '&' -> loop (i + 2) (TAndAnd :: acc)
      | '|' when followed_by i '|' -> loop (i + 2) (TOrOr :: acc)
      | '<' when followed_by i '=' -> loop (i + 2) (TLe :: acc)
      | '<' -> loop (i + 1) (TLt :: acc)
      | '>' when followed_by i '=' -> loop (i + 2) (TGe :: acc)
      | '>' -> loop (i + 1) (TGt :: acc)
      (* Literals and identifiers *)
      | c when is_digit c ->
        let tok, j = read_number i (i + 1) in
        loop j (tok :: acc)
      | c when is_ident_start c ->
        let tok, j = read_ident i (i + 1) in
        loop j (tok :: acc)
      | c ->
        let msg = Printf.sprintf "unexpected character: %c" c in
        raise (Lex_error msg)
  in
  loop 0 []
|