This repository was archived by the owner on Jan 5, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 98
/
Copy pathparser.ts
253 lines (214 loc) · 7.19 KB
/
parser.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import Papa from 'papaparse';
import { ReadableWebToNodeStream } from 'readable-web-to-node-stream';
// Unicode byte-order-mark code point; may survive on the first cell of a file
const BOM_CODE = 65279; // 0xFEFF

// Subset of PapaParse options that callers may customize for parsing;
// everything else (preview size, streaming, etc.) is managed by this module
export interface CustomizablePapaParseConfig {
  delimiter?: Papa.ParseConfig['delimiter'];
  newline?: Papa.ParseConfig['newline'];
  quoteChar?: Papa.ParseConfig['quoteChar'];
  escapeChar?: Papa.ParseConfig['escapeChar'];
  comments?: Papa.ParseConfig['comments'];
  skipEmptyLines?: Papa.ParseConfig['skipEmptyLines'];
  delimitersToGuess?: Papa.ParseConfig['delimitersToGuess'];
  chunkSize?: Papa.ParseConfig['chunkSize'];
}

// snapshot of the start of a file produced by a successful preview parse
export interface PreviewReport {
  file: File;
  firstChunk: string; // raw text of the first parsed chunk
  firstRows: string[][]; // always PREVIEW_ROWS count
  isSingleLine: boolean; // true when the file contained exactly one data row
  parseWarning?: Papa.ParseError; // first non-fatal parse error, if any
}

// success/failure report from the preview parse attempt
export type PreviewResults<Report extends PreviewReport = PreviewReport> =
  | {
      parseError: Error | Papa.ParseError;
      file: File;
    }
  | ({
      parseError: undefined;
    } & Report);

// complete "workspace" for kicking off the full parse @todo rename
export interface Preview extends PreviewReport {
  papaParseConfig: CustomizablePapaParseConfig; // config that was used for preview parsing
  hasHeaders: boolean; // whether the first row is a header row (skipped during full parse)
}

// number of rows collected (and padded to) during preview
export const PREVIEW_ROW_COUNT = 5;

// maps an output field name to the source column index assigned to it
// (undefined means the field is currently unassigned)
export type FieldAssignmentMap = { [name: string]: number | undefined };

export type BaseRow = { [name: string]: unknown };

// per-batch consumer callback; returning a promise back-pressures the parser
// until the batch is consumed
export type ParseCallback<Row extends BaseRow> = (
  rows: Row[],
  info: {
    startIndex: number; // zero-based index of rows[0] within the whole file
  }
) => void | Promise<void>;
// Obtain a readable stream for the given blob, using the Response-based
// polyfill approach for browsers without Blob.prototype.stream
// (this is for Safari pre v14.1), as implemented in
// https://github.com/eligrey/Blob.js/blob/master/Blob.js#L653
function streamForBlob(blob: Blob) {
  // modern browsers expose the stream directly on the blob
  if (blob.stream) {
    return blob.stream();
  }

  // fallback: route the blob through a Response to obtain its body stream
  const body = new Response(blob).body;

  if (body === null) {
    throw new Error('This browser does not support client-side file reads');
  }

  return body;
}
/**
 * Quickly parse the first few rows of a file to produce a preview report.
 *
 * Resolves (never rejects) with either a successful PreviewReport or the
 * failure variant of PreviewResults carrying the error and the original file.
 *
 * @param file source file to preview
 * @param customConfig caller-adjustable PapaParse options (delimiters, etc.)
 * @returns promise of the preview outcome; errors are folded into the result
 */
export function parsePreview(
  file: File,
  customConfig: CustomizablePapaParseConfig
): Promise<PreviewResults> {
  // wrap synchronous errors in promise
  return new Promise<PreviewResults>((resolve) => {
    let firstChunk: string | null = null;
    let firstWarning: Papa.ParseError | undefined = undefined;
    const rowAccumulator: string[][] = [];

    function reportSuccess() {
      // PapaParse normally complains first anyway, but might as well flag it
      if (rowAccumulator.length === 0) {
        // fix: this failure object used to be *returned* (and discarded by
        // PapaParse) instead of resolved, leaving the promise forever
        // pending on empty files
        resolve({
          parseError: new Error('File is empty'),
          file
        });
        return;
      }

      // remember whether this file has only one line
      const isSingleLine = rowAccumulator.length === 1;

      // fill preview with blanks if needed
      while (rowAccumulator.length < PREVIEW_ROW_COUNT) {
        rowAccumulator.push([]);
      }

      // a repeat call (complete firing after abort) is harmless because the
      // promise is already settled by then
      resolve({
        file,
        parseError: undefined,
        parseWarning: firstWarning || undefined,
        firstChunk: firstChunk || '',
        firstRows: rowAccumulator,
        isSingleLine
      });
    }

    // true streaming support for local files (@todo wait for upstream fix)
    // @todo close the stream
    const nodeStream = new ReadableWebToNodeStream(streamForBlob(file));

    Papa.parse(nodeStream, {
      ...customConfig,

      chunkSize: 10000, // not configurable, preview only
      preview: PREVIEW_ROW_COUNT,
      skipEmptyLines: true,
      error: (error) => {
        // fix: include the file, as required by the failure variant of
        // PreviewResults (previously omitted here)
        resolve({
          parseError: error,
          file
        });
      },
      beforeFirstChunk: (chunk) => {
        firstChunk = chunk;
      },
      chunk: ({ data, errors }, parser) => {
        // ignoring possible leading BOM
        data.forEach((row) => {
          // coerce non-string cells to empty strings for preview display
          rowAccumulator.push(
            (row as unknown[]).map((item) =>
              typeof item === 'string' ? item : ''
            )
          );
        });

        if (errors.length > 0 && !firstWarning) {
          firstWarning = errors[0];
        }

        // finish parsing after first chunk
        nodeStream.pause(); // parser does not pause source stream, do it here explicitly
        parser.abort();

        reportSuccess();
      },
      complete: reportSuccess
    });
  }).catch((error) => {
    return {
      parseError: error, // delegate message display to UI logic
      file
    };
  });
}
/**
 * Parse the entire file, mapping assigned columns to named fields and
 * streaming batches of row objects to the callback.
 *
 * @param preview workspace from the preview step (file, header flag, config)
 * @param fieldAssignments maps output field names to source column indexes
 * @param reportProgress invoked with the row count of each parsed batch
 * @param callback consumer for each batch; a returned promise back-pressures
 *   parsing until the batch is consumed
 * @returns promise that resolves when the whole file has been processed,
 *   or rejects on a fatal parse error
 */
export function processFile<Row extends BaseRow>(
  preview: Preview,
  fieldAssignments: FieldAssignmentMap,
  reportProgress: (deltaCount: number) => void,
  callback: ParseCallback<Row>
): Promise<void> {
  const { file, hasHeaders, papaParseConfig } = preview;
  const fieldNames = Object.keys(fieldAssignments);

  // wrap synchronous errors in promise
  return new Promise<void>((resolve, reject) => {
    // skip first line if needed
    let skipLine = hasHeaders;
    // only strip a BOM from data when the (BOM-carrying) header line is kept
    let skipBOM = !hasHeaders;
    let processedCount = 0;

    // true streaming support for local files (@todo wait for upstream fix)
    const nodeStream = new ReadableWebToNodeStream(streamForBlob(file));

    Papa.parse(nodeStream, {
      ...papaParseConfig,

      chunkSize: papaParseConfig.chunkSize || 10000, // our own preferred default
      error: (error) => {
        reject(error);
      },
      chunk: ({ data }, parser) => {
        // pause to wait until the rows are consumed
        nodeStream.pause(); // parser does not pause source stream, do it here explicitly
        parser.pause();

        // drop the header row, but only once there is data in this chunk
        const skipped = skipLine && data.length > 0;
        const rows = (skipped ? data.slice(1) : data).map((row) => {
          // coerce non-string cells to empty strings
          const stringRow = (row as unknown[]).map((item) =>
            typeof item === 'string' ? item : ''
          );

          // perform BOM skip on first value
          if (skipBOM && stringRow.length > 0) {
            skipBOM = false; // one-shot: only the very first cell can carry it

            stringRow[0] =
              stringRow[0].charCodeAt(0) === BOM_CODE
                ? stringRow[0].substring(1)
                : stringRow[0];
          }

          // build the output record from only the assigned columns
          const record = {} as { [name: string]: string | undefined };
          fieldNames.forEach((fieldName) => {
            const columnIndex = fieldAssignments[fieldName];
            if (columnIndex !== undefined) {
              record[fieldName] = stringRow[columnIndex];
            }
          });
          return record as Row; // @todo look into a more precise setup
        });

        // clear line skip flag if there was anything to skip
        if (skipped) {
          skipLine = false;
        }

        // info snapshot for processing callback
        const info = {
          startIndex: processedCount
        };
        processedCount += rows.length;

        // @todo collect errors
        reportProgress(rows.length);

        // wrap sync errors in promise
        // (avoid invoking callback if there are no rows to consume)
        const whenConsumed = new Promise<void>((resolve) => {
          const result = rows.length ? callback(rows, info) : undefined;

          // introduce delay to allow a frame render
          setTimeout(() => resolve(result), 0);
        });

        // unpause parsing when done
        whenConsumed.then(
          () => {
            nodeStream.resume();
            parser.resume();
          },
          () => {
            // @todo collect errors
            nodeStream.resume();
            parser.resume();
          }
        );
      },
      complete: () => {
        resolve();
      }
    });
  });
}