因為 OCaml 內建的 lexer 產生器(ocamllex)預設不支援 Unicode,更合適的 lexer 程式庫是 sedlex。以下範例取自 https://github.com/ocaml-community/sedlex/blob/master/examples/repeat.ml

(** [token buf] consumes [buf] one lexeme at a time, printing one line per
    lexeme to standard output. It prints ["\tEnd"] and returns [()] once
    [eof] is matched.

    The rules exercise sedlex's [Rep] operator with both a fixed count
    ([Rep (r, n)]) and a range ([Rep (r, n .. m)]).

    @raise Failure if the final catch-all rule is ever selected. *)
let rec token buf =
  (* Print [label] on its own line, then carry on with the rest of [buf]. *)
  let report label =
    print_endline label;
    token buf
  in
  match%sedlex buf with
  | white_space -> report "\tWhitespace"
  | 'a', Rep (white_space, 1) -> report "a\n\tWhitespace"
  | Rep ("bc", 2) -> report "bcbc"
  | Rep ("d", 1 .. 1) -> report "d"
  | Rep ("ef", 1 .. 3) ->
      (* The matched text varies for this rule, so echo the lexeme itself. *)
      Printf.printf "%s\n" (Sedlexing.Utf8.lexeme buf);
      token buf
  | eof -> print_endline "\tEnd"
  | any -> report "Other"
  | _ -> failwith "Internal failure: Reached impossible place"
 
(* Entry point: build a UTF-8 lexing buffer over a fixed sample string and
   run [token] over it. *)
let () =
  "a bcbc d ef efef efefef"
  |> Sedlexing.Utf8.from_string
  |> token