forked from rescript-lang/rescript
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathext_utf8.ml
94 lines (86 loc) · 3.51 KB
/
ext_utf8.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
(* Copyright (C) 2015-2016 Bloomberg Finance L.P.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* In addition to the permissions granted to you by the LGPL, you may combine
* or link a "work that uses the Library" with a publicly distributed version
* of this file to produce a combined library or application, then distribute
* that combined work under the terms of your choosing, with no requirement
* to comply with the obligations normally placed on you by section 4 of the
* LGPL version 3 (or the corresponding section of a later version of the LGPL
* should you choose to use a later version).
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *)
type byte = Single of int | Cont of int | Leading of int * int | Invalid
(** [classify chr] returns the {!byte} corresponding to [chr] *)
let classify chr =
let c = int_of_char chr in
(* Classify byte according to leftmost 0 bit *)
if c land 0b1000_0000 = 0 then Single c
else if (* c 0b0____*)
c land 0b0100_0000 = 0 then Cont (c land 0b0011_1111)
else if (* c 0b10___*)
c land 0b0010_0000 = 0 then Leading (1, c land 0b0001_1111)
else if (* c 0b110__*)
c land 0b0001_0000 = 0 then Leading (2, c land 0b0000_1111)
else if (* c 0b1110_ *)
c land 0b0000_1000 = 0 then Leading (3, c land 0b0000_0111)
else if (* c 0b1111_0___*)
c land 0b0000_0100 = 0 then Leading (4, c land 0b0000_0011)
else if (* c 0b1111_10__*)
c land 0b0000_0010 = 0 then Leading (5, c land 0b0000_0001)
(* c 0b1111_110__ *)
else Invalid
exception Invalid_utf8 of string
(* when the first char is [Leading],
TODO: need more error checking
when out of bond
*)
let rec follow s n (c : int) offset =
if n = 0 then (c, offset)
else
match classify s.[offset + 1] with
| Cont cc -> follow s (n - 1) ((c lsl 6) lor (cc land 0x3f)) (offset + 1)
| _ -> raise (Invalid_utf8 "Continuation byte expected")
let rec next s ~remaining offset =
if remaining = 0 then offset
else
match classify s.[offset + 1] with
| Cont _cc -> next s ~remaining:(remaining - 1) (offset + 1)
| _ -> -1
| exception _ -> -1
(* it can happen when out of bound *)
let decode_utf8_string s =
let lst = ref [] in
let add elem = lst := elem :: !lst in
let rec decode_utf8_cont s i s_len =
if i = s_len then ()
else
match classify s.[i] with
| Single c ->
add c;
decode_utf8_cont s (i + 1) s_len
| Cont _ -> raise (Invalid_utf8 "Unexpected continuation byte")
| Leading (n, c) ->
let c', i' = follow s n c i in
add c';
decode_utf8_cont s (i' + 1) s_len
| Invalid -> raise (Invalid_utf8 "Invalid byte")
in
decode_utf8_cont s 0 (String.length s);
List.rev !lst
(** To decode {j||j} we need verify in the ast so that we have better error
location, then we do the decode later
*)
(* let verify s loc =
assert false *)