forked from rescript-lang/rescript
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsedlexing.mli
170 lines (131 loc) · 7.22 KB
/
sedlexing.mli
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
(* The package sedlex is released under the terms of an MIT-like license. *)
(* See the attached LICENSE file. *)
(* Copyright 2005, 2013 by Alain Frisch and LexiFi. *)
(** Runtime support for lexers generated by [sedlex]. *)
(** This module is roughly equivalent to the module Lexing from the
OCaml standard library, except that its lexbuffers handle Unicode
code points (OCaml type: {!Uchar.t} in the range [0..0x10ffff])
instead of bytes (OCaml type: [char]).
It is possible to have sedlex-generated lexers work on a custom
implementation for lex buffers. To do this, define a module [L]
which implements the [start], [next], [mark] and [backtrack]
functions (See the Internal Interface section below for a
specification). They need not work on a type named [lexbuf]: you
can use the type name you want. Then, in your
sedlex-processed source, bind this module to the name [Sedlexing]
(for instance, with a local module definition: [let module Sedlexing
= L in ...]).
Of course, you'll probably want to define functions like [lexeme] to
be used in the lexers' semantic actions. *)
type lexbuf
(** The type of lexer buffers. A lexer buffer is the argument passed
    to the scanning functions defined by the generated lexers.

    The lexer buffer holds the internal information for the
    scanners, including the code points of the token currently scanned,
    its position from the beginning of the input stream,
    and the current position of the lexer.

    The type is abstract: values are obtained from the constructors
    declared in this interface, such as [from_int_array] and
    [Utf8.from_string]. *)
exception InvalidCodepoint of int
(** Raised by some functions to signal that a code point is not
    compatible with a specified encoding.

    The [int] argument presumably carries the offending code point;
    this is not stated by the interface and should be confirmed
    against the implementation. *)
exception MalFormed
(** Raised by functions in the [Utf8] and [Utf16] modules to report
    strings which do not comply with the encoding.

    Note: of the two modules named above, this interface as shown
    declares only [Utf8]. *)
(** {6 Creating generic lexbufs} *)
(* val create: (Uchar.t array -> int -> int -> int) -> lexbuf *)
(** Create a generic lexer buffer. When the lexer needs more
characters, it will call the given function, giving it an array of
Uchars [a], a position [pos] and a code point count [n]. The
function should put [n] code points or less in [a], starting at
position [pos], and return the number of characters provided. A
return value of 0 means end of input. *)
(* val set_position: lexbuf -> Lexing.position -> unit *)
(** Set the initial tracked input position for [lexbuf].
If set to [Lexing.dummy_pos], Sedlexing will not track position
information for you. *)
(* val set_filename: lexbuf -> string -> unit *)
(** [set_filename lexbuf file] sets the filename to [file] in
[lexbuf]. It also sets the {!Lexing.pos_fname} field in
returned {!Lexing.position} records. *)
val from_int_array: int array -> lexbuf
(** [from_int_array a] creates a lexbuf from the array [a] of Unicode
    code points, each given as a plain [int].

    NOTE(review): whether values outside the Unicode range are
    rejected is not visible from this interface. *)
(* val from_uchar_array: Uchar.t array -> lexbuf *)
(** Create a lexbuf from an array of Unicode code points. *)
(** {6 Interface for lexers' semantic actions} *)
(** The following functions can be called from the semantic actions of
lexer definitions. They give access to the character string matched
by the regular expression associated with the semantic action. *)
val lexeme_start: lexbuf -> int
(** [Sedlexing.lexeme_start lexbuf] returns the offset in the
    input stream of the first code point of the matched string.
    The first code point of the stream has offset 0. *)
val lexeme_end: lexbuf -> int
(** [Sedlexing.lexeme_end lexbuf] returns the offset in the input
    stream of the code point following the last code point of the
    matched string. The first code point of the stream has offset
    0. *)
(* val loc: lexbuf -> int * int *)
(** [Sedlexing.loc lexbuf] returns the pair
[(Sedlexing.lexeme_start lexbuf,Sedlexing.lexeme_end
lexbuf)]. *)
val lexeme_length: lexbuf -> int
(** [Sedlexing.lexeme_length lexbuf] returns the length, in code
    points, of the matched string. It is the difference
    [(Sedlexing.lexeme_end lexbuf) - (Sedlexing.lexeme_start
    lexbuf)]. *)
(* val lexing_positions : lexbuf -> Lexing.position*Lexing.position *)
(** [Sedlexing.lexing_positions lexbuf] returns the start and end
positions of the current token, using a record of type
[Lexing.position]. This is intended for consumption
by parsers like those generated by [Menhir]. *)
(* val new_line: lexbuf -> unit *)
(** [Sedlexing.new_line lexbuf] increments the line count and
sets the beginning of line to the current position, as though
a newline character had been encountered in the input. *)
val lexeme: lexbuf -> Uchar.t array
(** [Sedlexing.lexeme lexbuf] returns the string matched by the
    regular expression as an array of Unicode code points. *)
(* val lexeme_char: lexbuf -> int -> Uchar.t *)
(** [Sedlexing.lexeme_char lexbuf pos] returns code point number [pos] in
the matched string. *)
(* val sub_lexeme: lexbuf -> int -> int -> Uchar.t array *)
(** [Sedlexing.sub_lexeme lexbuf pos len] returns a substring of the string
matched by the regular expression as an array of Unicode code points. *)
val rollback: lexbuf -> unit
(** [Sedlexing.rollback lexbuf] puts [lexbuf] back in the configuration
    it had before the last lexeme was matched. It is then possible to
    use another lexer to parse the same characters again.

    The other functions above in this section should not be used in
    the semantic action after a call to [Sedlexing.rollback]. *)
(** {6 Internal interface} *)
(** These functions are used internally by the lexers. They could be used
to write lexers by hand, or with a lexer generator different from
[sedlex]. The lexer buffers have a unique internal slot that can store
an integer. They also store a "backtrack" position.
*)
val start: lexbuf -> unit
(** [start t] informs the lexer buffer that any
    code points until the current position can be discarded.
    The current position becomes the "start" position as returned
    by [Sedlexing.lexeme_start]. Moreover, the internal slot is set to
    [-1] and the backtrack position is set to the current position. *)
val next: lexbuf -> Uchar.t option
(** [next lexbuf] extracts the next code point from the
    lexer buffer and advances the current position. If the input
    stream is exhausted, the function returns [None].
    If a ['\n'] is encountered, the tracked line number is incremented. *)
val mark: lexbuf -> int -> unit
(** [mark lexbuf i] stores the integer [i] in the internal
    slot of the buffer and sets the backtrack position to the
    current position. *)
val backtrack: lexbuf -> int
(** [backtrack lexbuf] performs backtracking (the current position
    is set to the value of the backtrack position) and returns the
    value stored in the internal slot of the buffer. *)
module Utf8: sig
(* Construction of lexbufs from UTF-8 encoded strings, and access to
   the matched text re-encoded as UTF-8. *)
val from_string: string -> lexbuf
(** [from_string s] creates a lexbuf from the UTF-8 encoded string [s].

    NOTE(review): the documentation of the [MalFormed] exception says
    that functions of this module raise it for strings that do not
    comply with the encoding; which function raises it, and when,
    is not visible from this interface. *)
val lexeme: lexbuf -> string
(** As [Sedlexing.lexeme] with a result encoded in UTF-8: returns the
    string matched by the regular expression as an OCaml [string]. *)
val sub_lexeme: lexbuf -> int -> int -> string
(** As [Sedlexing.sub_lexeme] with a result encoded in UTF-8.

    The top-level [Sedlexing.sub_lexeme] is commented out in this
    interface; following its documentation, the call is
    [sub_lexeme lexbuf pos len] and returns a substring of the matched
    string. [pos] and [len] are presumably counted in code points
    rather than bytes; confirm against the implementation. *)
end