forked from rescript-lang/rescript
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathflow_sedlexing.ml
287 lines (257 loc) · 8.88 KB
/
flow_sedlexing.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
(* The package sedlex is released under the terms of an MIT-like license. *)
(* See the attached LICENSE file. *)
(* Copyright 2005, 2013 by Alain Frisch and LexiFi. *)
external ( .!()<- ) : int array -> int -> int -> unit = "%array_unsafe_set"
external ( .!() ) : int array -> int -> int = "%array_unsafe_get"
external ( .![] ) : string -> int -> char = "%string_unsafe_get"
external ( .![]<- ) : bytes -> int -> char -> unit = "%bytes_unsafe_set"
exception InvalidCodepoint of int
exception MalFormed
(* Absolute position from the beginning of the stream *)
type apos = int
(* critical states:
[pos] [curr_bol] [curr_line]
The state of [curr_bol] and [curr_line] only changes when we hit a newline
[marked_pos] [marked_bol] [marked_line]
[start_pos] [start_bol] [start_line]
get reset whenever we get a new token
*)
type lexbuf = {
buf: int array;
(* Number of meaningful char in buffer *)
len: int;
(* pos is the index in the buffer *)
mutable pos: int;
(* bol is the index in the input stream but not buffer *)
mutable curr_bol: int;
(* start from 1, if it is 0, we would not track postion info for you *)
mutable curr_line: int;
(* First char we need to keep visible *)
mutable start_pos: int;
mutable start_bol: int;
mutable start_line: int;
mutable marked_pos: int;
mutable marked_bol: int;
mutable marked_line: int;
mutable marked_val: int;
}
let lexbuf_clone (x : lexbuf) : lexbuf =
{
buf = x.buf;
len = x.len;
pos = x.pos;
curr_bol = x.curr_bol;
curr_line = x.curr_line;
start_pos = x.start_pos;
start_bol = x.start_bol;
start_line = x.start_line;
marked_pos = x.marked_pos;
marked_bol = x.marked_bol;
marked_line = x.marked_line;
marked_val = x.marked_val;
}
let empty_lexbuf =
{
buf = [||];
len = 0;
pos = 0;
curr_bol = 0;
curr_line = 0;
start_pos = 0;
start_bol = 0;
start_line = 0;
marked_pos = 0;
marked_bol = 0;
marked_line = 0;
marked_val = 0;
}
let from_int_array a =
let len = Array.length a in
{ empty_lexbuf with buf = a; len }
let from_int_sub_array a len =
{ empty_lexbuf with buf = a; len }
let new_line lexbuf =
if lexbuf.curr_line != 0 then lexbuf.curr_line <- lexbuf.curr_line + 1;
lexbuf.curr_bol <- lexbuf.pos
let next lexbuf : Stdlib.Uchar.t option =
if lexbuf.pos = lexbuf.len then
None
else
let ret = lexbuf.buf.!(lexbuf.pos) in
lexbuf.pos <- lexbuf.pos + 1;
if ret = 10 then new_line lexbuf;
Some (Stdlib.Uchar.unsafe_of_int ret)
let __private__next_int lexbuf : int =
if lexbuf.pos = lexbuf.len then
-1
else
let ret = lexbuf.buf.!(lexbuf.pos) in
lexbuf.pos <- lexbuf.pos + 1;
if ret = 10 then new_line lexbuf;
ret
let mark lexbuf i =
lexbuf.marked_pos <- lexbuf.pos;
lexbuf.marked_bol <- lexbuf.curr_bol;
lexbuf.marked_line <- lexbuf.curr_line;
lexbuf.marked_val <- i
let start lexbuf =
lexbuf.start_pos <- lexbuf.pos;
lexbuf.start_bol <- lexbuf.curr_bol;
lexbuf.start_line <- lexbuf.curr_line;
mark lexbuf (-1)
let backtrack lexbuf =
lexbuf.pos <- lexbuf.marked_pos;
lexbuf.curr_bol <- lexbuf.marked_bol;
lexbuf.curr_line <- lexbuf.marked_line;
lexbuf.marked_val
let rollback lexbuf =
lexbuf.pos <- lexbuf.start_pos;
lexbuf.curr_bol <- lexbuf.start_bol;
lexbuf.curr_line <- lexbuf.start_line
let lexeme_start lexbuf = lexbuf.start_pos
let set_lexeme_start lexbuf pos = lexbuf.start_pos <- pos
let lexeme_end lexbuf = lexbuf.pos
let loc lexbuf = (lexbuf.start_pos , lexbuf.pos )
let lexeme_length lexbuf = lexbuf.pos - lexbuf.start_pos
let sub_lexeme lexbuf pos len = Array.sub lexbuf.buf (lexbuf.start_pos + pos) len
let lexeme lexbuf = Array.sub lexbuf.buf lexbuf.start_pos (lexbuf.pos - lexbuf.start_pos)
let current_code_point lexbuf = lexbuf.buf.(lexbuf.start_pos)
(* Decode UTF-8 encoded [s] into codepoints in [a], returning the length of the
* decoded string.
*
* To call this function safely:
* - ensure that [slen] is not greater than the length of [s]
* - ensure that [a] has enough capacity to hold the decoded value
*)
let unsafe_utf8_of_string (s : string) slen (a : int array) : int =
let spos = ref 0 in
let apos = ref 0 in
while !spos < slen do
let spos_code = s.![!spos] in
(match spos_code with
| '\000' .. '\127' as c ->
(* U+0000 - U+007F: 0xxxxxxx *)
a.!(!apos) <- Char.code c;
incr spos
| '\192' .. '\223' as c ->
(* U+0080 - U+07FF: 110xxxxx 10xxxxxx *)
let n1 = Char.code c in
let n2 = Char.code s.![!spos + 1] in
if n2 lsr 6 != 0b10 then raise MalFormed;
a.!(!apos) <- ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f);
spos := !spos + 2
| '\224' .. '\239' as c ->
(* U+0800 - U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx
U+D800 - U+DFFF are reserved for surrogate halves (RFC 3629) *)
let n1 = Char.code c in
let n2 = Char.code s.![!spos + 1] in
let n3 = Char.code s.![!spos + 2] in
let p = ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f) in
if (n2 lsr 6 != 0b10 || n3 lsr 6 != 0b10) || (p >= 0xd800 && p <= 0xdfff) then raise MalFormed;
a.!(!apos) <- p;
spos := !spos + 3
| '\240' .. '\247' as c ->
(* U+10000 - U+1FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
> U+10FFFF are invalid (RFC 3629) *)
let n1 = Char.code c in
let n2 = Char.code s.![!spos + 1] in
let n3 = Char.code s.![!spos + 2] in
let n4 = Char.code s.![!spos + 3] in
if n2 lsr 6 != 0b10 || n3 lsr 6 != 0b10 || n4 lsr 6 != 0b10 then raise MalFormed;
let p =
((n1 land 0x07) lsl 18)
lor ((n2 land 0x3f) lsl 12)
lor ((n3 land 0x3f) lsl 6)
lor (n4 land 0x3f)
in
if p > 0x10ffff then raise MalFormed;
a.!(!apos) <- p;
spos := !spos + 4
| _ -> raise MalFormed);
incr apos
done;
!apos
(* Encode the decoded codepoints in [a] as UTF-8 into [b], returning the length
* of the encoded string.
*
* To call this function safely:
* - ensure that [offset + len] is not greater than the length of [a]
* - ensure that [b] has sufficient capacity to hold the encoded value
*)
let unsafe_string_of_utf8 (a : int array) ~(offset : int) ~(len : int) (b : bytes) : int =
let apos = ref offset in
let len = ref len in
let i = ref 0 in
while !len > 0 do
let u = a.!(!apos) in
if u < 0 then
raise MalFormed
else if u <= 0x007F then begin
b.![!i] <- Char.unsafe_chr u;
incr i
end else if u <= 0x07FF then (
b.![!i] <- Char.unsafe_chr (0xC0 lor (u lsr 6));
b.![!i + 1] <- Char.unsafe_chr (0x80 lor (u land 0x3F));
i := !i + 2
) else if u <= 0xFFFF then (
b.![!i] <- Char.unsafe_chr (0xE0 lor (u lsr 12));
b.![!i + 1] <- Char.unsafe_chr (0x80 lor ((u lsr 6) land 0x3F));
b.![!i + 2] <- Char.unsafe_chr (0x80 lor (u land 0x3F));
i := !i + 3
) else if u <= 0x10FFFF then (
b.![!i] <- Char.unsafe_chr (0xF0 lor (u lsr 18));
b.![!i + 1] <- Char.unsafe_chr (0x80 lor ((u lsr 12) land 0x3F));
b.![!i + 2] <- Char.unsafe_chr (0x80 lor ((u lsr 6) land 0x3F));
b.![!i + 3] <- Char.unsafe_chr (0x80 lor (u land 0x3F));
i := !i + 4
) else
raise MalFormed;
incr apos;
decr len
done;
!i
module Utf8 = struct
let from_string s =
let slen = String.length s in
let a = Array.make slen 0 in
let len = unsafe_utf8_of_string s slen a in
from_int_sub_array a len
let sub_lexeme lexbuf pos len : string =
let offset = lexbuf.start_pos + pos in
let b = Bytes.create (len * 4) in
let buf = lexbuf.buf in
(* Assertion needed, since we make use of unsafe API below *)
assert (offset + len <= Array.length buf);
let i = unsafe_string_of_utf8 buf ~offset ~len b in
Bytes.sub_string b 0 i
let lexeme lexbuf : string =
let offset = lexbuf.start_pos in
let len = lexbuf.pos - offset in
let b = Bytes.create (len * 4) in
let buf = lexbuf.buf in
let i = unsafe_string_of_utf8 buf ~offset ~len b in
Bytes.sub_string b 0 i
let lexeme_to_buffer lexbuf buffer : unit =
let offset = lexbuf.start_pos in
let len = lexbuf.pos - offset in
let b = Bytes.create (len * 4) in
let buf = lexbuf.buf in
let i = unsafe_string_of_utf8 buf ~offset ~len b in
Buffer.add_subbytes buffer b 0 i
let lexeme_to_buffer2 lexbuf buf1 buf2 : unit =
let offset = lexbuf.start_pos in
let len = lexbuf.pos - offset in
let b = Bytes.create (len * 4) in
let buf = lexbuf.buf in
let i = unsafe_string_of_utf8 buf ~offset ~len b in
Buffer.add_subbytes buf1 b 0 i;
Buffer.add_subbytes buf2 b 0 i
end
let string_of_utf8 (lexbuf : int array) : string =
let offset = 0 in
let len = Array.length lexbuf in
let b = Bytes.create (len * 4) in
let i = unsafe_string_of_utf8 lexbuf ~offset ~len b in
Bytes.sub_string b 0 i
let backoff lexbuf npos =
lexbuf.pos <- lexbuf.pos - npos