forked from rescript-lang/rescript
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsedlexing.ml
288 lines (244 loc) · 8.32 KB
/
sedlexing.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
(* The package sedlex is released under the terms of an MIT-like license. *)
(* See the attached LICENSE file. *)
(* Copyright 2005, 2013 by Alain Frisch and LexiFi. *)
exception InvalidCodepoint of int
exception MalFormed
(* Absolute position from the beginning of the stream *)
type apos = int
type lexbuf = {
refill: (Uchar.t array -> int -> int -> int);
mutable buf: Uchar.t array;
mutable len: int; (* Number of meaningful char in buffer *)
mutable offset: apos; (* Position of the first char in buffer
in the input stream *)
mutable pos: int; (* pos is the index in the buffer *)
mutable curr_bol: int; (* bol is the index in the input stream but not buffer *)
mutable curr_line: int; (* start from 1, if it is 0, we would not track postion info for you *)
mutable start_pos: int; (* First char we need to keep visible *)
mutable start_bol: int;
mutable start_line: int;
mutable marked_pos: int;
mutable marked_bol: int;
mutable marked_line: int;
mutable marked_val: int;
mutable filename: string;
mutable finished: bool;
}
let chunk_size = 512
let empty_lexbuf = {
refill = (fun _ _ _ -> assert false);
buf = [| |];
len = 0;
offset = 0;
pos = 0;
curr_bol = 0;
curr_line = 0;
start_pos = 0;
start_bol = 0;
start_line = 0;
marked_pos = 0;
marked_bol = 0;
marked_line = 0;
marked_val = 0;
filename = "";
finished = false;
}
(* let create f = {
empty_lexbuf with
refill = f;
buf = Array.make chunk_size (Uchar.of_int 0);
curr_line = 1;
}
let set_position lexbuf position =
lexbuf.offset <- position.Lexing.pos_cnum - lexbuf.pos;
lexbuf.curr_bol <- position.Lexing.pos_bol;
lexbuf.curr_line <- position.Lexing.pos_lnum
let set_filename lexbuf fname =
lexbuf.filename <- fname *)
(* let fill_buf_from_gen f gen buf pos len =
let rec aux i =
if i >= len then len
else match gen () with
| Some c -> buf.(pos + i) <- f c ; aux (i+1)
| None -> i
in
aux 0 *)
let from_int_array a =
let len = Array.length a in
{
empty_lexbuf with
buf = Array.init len (fun i -> Uchar.of_int a.(i));
len = len;
finished = true;
}
let refill lexbuf =
if lexbuf.len + chunk_size > Array.length lexbuf.buf
then begin
let s = lexbuf.start_pos in
let ls = lexbuf.len - s in
if ls + chunk_size <= Array.length lexbuf.buf then
Array.blit lexbuf.buf s lexbuf.buf 0 ls
else begin
let newlen = (Array.length lexbuf.buf + chunk_size) * 2 in
let newbuf = Array.make newlen (Uchar.of_int 0) in
Array.blit lexbuf.buf s newbuf 0 ls;
lexbuf.buf <- newbuf
end;
lexbuf.len <- ls;
lexbuf.offset <- lexbuf.offset + s;
lexbuf.pos <- lexbuf.pos - s;
lexbuf.marked_pos <- lexbuf.marked_pos - s;
lexbuf.start_pos <- 0
end;
let n = lexbuf.refill lexbuf.buf lexbuf.pos chunk_size in
if n = 0
then lexbuf.finished <- true
else lexbuf.len <- lexbuf.len + n
let new_line lexbuf =
if lexbuf.curr_line != 0 then
lexbuf.curr_line <- lexbuf.curr_line + 1;
lexbuf.curr_bol <- lexbuf.pos + lexbuf.offset
let next lexbuf =
if (not lexbuf.finished) && (lexbuf.pos = lexbuf.len) then refill lexbuf;
if lexbuf.finished && (lexbuf.pos = lexbuf.len) then None
else begin
let ret = lexbuf.buf.(lexbuf.pos) in
lexbuf.pos <- lexbuf.pos + 1;
if ret = (Uchar.of_int 10) then new_line lexbuf;
Some ret
end
let mark lexbuf i =
lexbuf.marked_pos <- lexbuf.pos;
lexbuf.marked_bol <- lexbuf.curr_bol;
lexbuf.marked_line <- lexbuf.curr_line;
lexbuf.marked_val <- i
let start lexbuf =
lexbuf.start_pos <- lexbuf.pos;
lexbuf.start_bol <- lexbuf.curr_bol;
lexbuf.start_line <- lexbuf.curr_line;
mark lexbuf (-1)
let backtrack lexbuf =
lexbuf.pos <- lexbuf.marked_pos;
lexbuf.curr_bol <- lexbuf.marked_bol;
lexbuf.curr_line <- lexbuf.marked_line;
lexbuf.marked_val
let rollback lexbuf =
lexbuf.pos <- lexbuf.start_pos;
lexbuf.curr_bol <- lexbuf.start_bol;
lexbuf.curr_line <- lexbuf.start_line
let lexeme_start lexbuf = lexbuf.start_pos + lexbuf.offset
let lexeme_end lexbuf = lexbuf.pos + lexbuf.offset
(* let loc lexbuf = (lexbuf.start_pos + lexbuf.offset, lexbuf.pos + lexbuf.offset) *)
let lexeme_length lexbuf = lexbuf.pos - lexbuf.start_pos
(* let sub_lexeme lexbuf pos len =
Array.sub lexbuf.buf (lexbuf.start_pos + pos) len *)
let lexeme lexbuf =
Array.sub lexbuf.buf (lexbuf.start_pos) (lexbuf.pos - lexbuf.start_pos)
(* let lexeme_char lexbuf pos =
lexbuf.buf.(lexbuf.start_pos + pos) *)
(* let lexing_positions lexbuf =
let start_p = {
Lexing.pos_fname = lexbuf.filename;
pos_lnum = lexbuf.start_line;
pos_cnum = lexbuf.start_pos + lexbuf.offset;
pos_bol = lexbuf.start_bol;
} and curr_p = {
Lexing.pos_fname = lexbuf.filename;
pos_lnum = lexbuf.curr_line;
pos_cnum = lexbuf.pos + lexbuf.offset;
pos_bol = lexbuf.curr_bol;
} in
(start_p, curr_p) *)
module Utf8 = struct
module Helper = struct
(* http://www.faqs.org/rfcs/rfc3629.html *)
let width = Array.make 256 (-1)
let () =
for i = 0 to 127 do width.(i) <- 1 done;
for i = 192 to 223 do width.(i) <- 2 done;
for i = 224 to 239 do width.(i) <- 3 done;
for i = 240 to 247 do width.(i) <- 4 done
let next s i =
match s.[i] with
| '\000'..'\127' as c ->
Char.code c
| '\192'..'\223' as c ->
let n1 = Char.code c in
let n2 = Char.code s.[i+1] in
if (n2 lsr 6 != 0b10) then raise MalFormed;
((n1 land 0x1f) lsl 6) lor (n2 land 0x3f)
| '\224'..'\239' as c ->
let n1 = Char.code c in
let n2 = Char.code s.[i+1] in
let n3 = Char.code s.[i+2] in
if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) then raise MalFormed;
let p =
((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f)
in
if (p >= 0xd800) && (p <= 0xdf00) then raise MalFormed;
p
| '\240'..'\247' as c ->
let n1 = Char.code c in
let n2 = Char.code s.[i+1] in
let n3 = Char.code s.[i+2] in
let n4 = Char.code s.[i+3] in
if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) || (n4 lsr 6 != 0b10)
then raise MalFormed;
((n1 land 0x07) lsl 18) lor ((n2 land 0x3f) lsl 12) lor
((n3 land 0x3f) lsl 6) lor (n4 land 0x3f)
| _ -> raise MalFormed
let compute_len s pos bytes =
let rec aux n i =
if i >= pos + bytes then if i = pos + bytes then n else raise MalFormed
else
let w = width.(Char.code s.[i]) in
if w > 0 then aux (succ n) (i + w)
else raise MalFormed
in
aux 0 pos
let rec blit_to_int s spos a apos n =
if n > 0 then begin
a.(apos) <- next s spos;
blit_to_int s (spos + width.(Char.code s.[spos])) a (succ apos) (pred n)
end
let to_int_array s pos bytes =
let n = compute_len s pos bytes in
let a = Array.make n 0 in
blit_to_int s pos a 0 n;
a
(**************************)
let store b p =
if p <= 0x7f then
Buffer.add_char b (Char.chr p)
else if p <= 0x7ff then (
Buffer.add_char b (Char.chr (0xc0 lor (p lsr 6)));
Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
)
else if p <= 0xffff then (
if (p >= 0xd800 && p < 0xe000) then raise MalFormed;
Buffer.add_char b (Char.chr (0xe0 lor (p lsr 12)));
Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f)));
Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
)
else if p <= 0x10ffff then (
Buffer.add_char b (Char.chr (0xf0 lor (p lsr 18)));
Buffer.add_char b (Char.chr (0x80 lor ((p lsr 12) land 0x3f)));
Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f)));
Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
)
else raise MalFormed
let from_uchar_array a apos len =
let b = Buffer.create (len * 4) in
let rec aux apos len =
if len > 0
then (store b (Uchar.to_int a.(apos)); aux (succ apos) (pred len))
else Buffer.contents b in
aux apos len
end
let from_string s =
from_int_array (Helper.to_int_array s 0 (String.length s))
let sub_lexeme lexbuf pos len =
Helper.from_uchar_array lexbuf.buf (lexbuf.start_pos + pos) len
let lexeme lexbuf =
sub_lexeme lexbuf 0 (lexbuf.pos - lexbuf.start_pos)
end