create_load_test_tables.py
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
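"""Create BigQuery tables of increasing size for load testing.

A small base table is loaded from a local JSONL file, then each sized
table (1 MB through 1 TB) is built by unioning together enough copies
of the previous table to exceed the target size.

Requires the GOOGLE_CLOUD_PROJECT environment variable and credentials
the BigQuery client can discover (e.g. application default credentials).
"""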
import math
import os
import pathlib
import sys

import google.cloud.bigquery as bigquery
REPO_ROOT = pathlib.Path(__file__).parent.parent

PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
if not PROJECT_ID:
    print(
        "Please set GOOGLE_CLOUD_PROJECT environment variable before running.",
        file=sys.stderr,
    )
    sys.exit(1)
DATASET_ID = f"{PROJECT_ID}.load_testing"
TABLE_ID = f"{DATASET_ID}.scalars"
TABLE_ID_FORMAT = f"{DATASET_ID}.scalars_{{size}}"
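# Decimal (SI) byte multiples, used to express the target table sizes.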
KB_BYTES = 1000
MB_BYTES = 1000 * KB_BYTES
GB_BYTES = 1000 * MB_BYTES
TB_BYTES = 1000 * GB_BYTES
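# (table name suffix, approximate target size in bytes) for each table to create.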
SIZES = (
    ("1mb", MB_BYTES),
    ("10mb", 10 * MB_BYTES),
    ("100mb", 100 * MB_BYTES),
    ("1gb", GB_BYTES),
    ("10gb", 10 * GB_BYTES),
    ("100gb", 100 * GB_BYTES),
    ("1tb", TB_BYTES),
)
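# Schema and seed rows for the base table, stored with the repo's test data.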
SCHEMA_PATH = REPO_ROOT / "tests" / "data" / "scalars_schema.json"
DATA_PATH = REPO_ROOT / "tests" / "data" / "scalars.jsonl"
BQCLIENT = bigquery.Client()


def create_dataset():
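    """Create the load_testing dataset, ignoring it if it already exists."""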
    dataset = bigquery.Dataset(DATASET_ID)
    BQCLIENT.create_dataset(dataset, exists_ok=True)


def load_scalars_table():
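    """Load the base scalars table from the local JSONL file, replacing any existing table."""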
    schema = BQCLIENT.schema_from_json(SCHEMA_PATH)
    job_config = bigquery.LoadJobConfig()
    job_config.schema = schema
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    print(f"Creating {TABLE_ID}")
    with open(DATA_PATH, "rb") as data_file:
        BQCLIENT.load_table_from_file(
            data_file,
            TABLE_ID,
            job_config=job_config,
        ).result()


def multiply_table(previous_table_id, target_table_id, multiplier):
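    """Write `multiplier` copies of previous_table_id, unioned together, to target_table_id."""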
    clauses = [f"SELECT * FROM `{previous_table_id}`"] * multiplier
    query = " UNION ALL ".join(clauses)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = target_table_id
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    print(f"Creating {target_table_id}, {multiplier} x {previous_table_id}")
    BQCLIENT.query_and_wait(query, job_config=job_config)


def create_tables():
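    """Build each sized table by repeatedly multiplying the previous (smaller) table."""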
    base_table = BQCLIENT.get_table(TABLE_ID)
    previous_bytes = base_table.num_bytes
    previous_table_id = TABLE_ID
    for table_suffix, target_bytes in SIZES:
        # Make sure we exceed the desired bytes by adding to the multiplier.
        multiplier = math.ceil(target_bytes / previous_bytes) + 1
        target_table_id = TABLE_ID_FORMAT.format(size=table_suffix)
        multiply_table(previous_table_id, target_table_id, multiplier)
        table = BQCLIENT.get_table(target_table_id)
        previous_bytes = table.num_bytes
        previous_table_id = target_table_id


def main():
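    """Create the dataset, load the base table, then build the sized tables."""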
    create_dataset()
    load_scalars_table()
    create_tables()


if __name__ == "__main__":
    main()