Skip to content
This repository was archived by the owner on Sep 21, 2021. It is now read-only.

Commit 2aecf88

Browse files
Implement basic timestamp generator
1 parent 5a8ba31 commit 2aecf88

File tree

1 file changed

+45
-3
lines changed

1 file changed

+45
-3
lines changed

scripts/300_Aggregations/generate.py

+45-3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import json
44
import sys
55
import random
6+
import datetime
7+
import math
68

79
vendors = [
810
"Yellow",
@@ -114,6 +116,11 @@ def distance(start, end):
114116
return base + 0.2 * random.randint(0, max(base, 1))
115117

116118

119+
def duration(dist_miles, min_mph=15, max_mph=25):
    """Return a randomized travel time for a trip of ``dist_miles`` miles.

    The average speed is drawn with ``random.randint`` from the whole numbers
    in ``[min_mph, max_mph]`` (15 - 25 miles per hour by default), so repeated
    calls for the same distance yield different durations.

    :param dist_miles: trip distance in miles.
    :param min_mph: lowest average speed that may be drawn (inclusive).
    :param max_mph: highest average speed that may be drawn (inclusive).
    :return: ``datetime.timedelta`` equal to the distance divided by the speed.
    """
    speed_mph = random.randint(min_mph, max_mph)
    return datetime.timedelta(hours=dist_miles / speed_mph)
122+
123+
117124
def fare(trip_distance):
118125
# loosely based on https://www.sfmta.com/getting-around/taxi/taxi-rates
119126
# assume a random waiting time up to 10% of the distance
@@ -126,27 +133,62 @@ def tip(fare_amount):
126133
# up to 20% tip
127134
return 0.2 * random.randint(0, round(fare_amount))
128135

136+
129137
def round_f(v):
    """Return ``v`` as a float limited to two decimal places.

    The value is rendered with the ``.2f`` format specifier and parsed back,
    so the result is the float closest to that two-decimal rendering.
    """
    return float("{0:.2f}".format(v))
131139

132140

141+
def generate_timestamp(current):
    """Return the timestamp of the next ride after ``current``.

    The targeted number of rides per hour depends on the ISO weekday and on
    how far ``current.hour`` is from the peak hour (noon). The gap to the next
    ride is then drawn from an exponential distribution with that rate.

    :param current: ``datetime.datetime`` of the previous ride.
    :return: ``current`` advanced by a random, non-negative increment.
    """
    hours_per_day = 24
    peak_hour = 12
    max_difference_hours = hours_per_day - peak_hour

    # isoweekday(): Monday is 1 ... Sunday is 7
    week_day = current.isoweekday()
    if week_day < 6:
        # Monday - Friday
        max_rides_per_hour = 1000
        min_rides_per_hour = 100
    elif week_day == 6:
        # Saturday
        max_rides_per_hour = 800
        min_rides_per_hour = 200
    else:
        # Sunday
        max_rides_per_hour = 600
        min_rides_per_hour = 50

    diff_from_peak_hour = abs(current.hour - peak_hour)
    # vary the targeted rides per hour between
    # [min_rides_per_hour; max_rides_per_hour] depending on the difference to
    # the peak hour, according to a sine function to smooth it a bit.
    traffic_scale_factor = math.sin(0.5 * math.pi * (max_difference_hours - diff_from_peak_hour) / max_difference_hours)
    target_rides_this_hour = min_rides_per_hour + (max_rides_per_hour - min_rides_per_hour) * traffic_scale_factor

    # expovariate() yields the gap in hours (mean 1 / rate); convert to seconds
    increment = random.expovariate(target_rides_this_hour) * 3600
    return current + datetime.timedelta(seconds=increment)
168+
169+
170+
def format_ts(ts):
    """Return ``ts`` formatted as ``YYYY-MM-DD HH:MM:SS``.

    Sub-second precision is not part of the format and is therefore dropped.

    :param ts: object with a ``strftime`` method, e.g. ``datetime.datetime``.
    :return: the formatted timestamp string.
    """
    return ts.strftime("%Y-%m-%d %H:%M:%S")
172+
173+
133174
def main():
134175
if len(sys.argv) != 2:
135176
print("usage: %s number_of_records_to_generate" % sys.argv[0])
136177
exit(1)
137178

179+
current = datetime.datetime(year=2017, month=4, day=1)
138180
num_records = int(sys.argv[1])
139181
for i in range(num_records):
140182
record = {}
183+
current = generate_timestamp(current)
141184
record["vendor"] = vendor()
142-
# TODO: Find a simple but somewhat realistic model for daily / weekly patterns
143-
# record["pickup_datetime"] = pickup_datetime
144-
# record["dropoff_datetime"] = dropoff_datetime
185+
record["pickup_datetime"] = format_ts(current)
145186
record["passenger_count"] = passengers()
146187

147188
start = random.choice(zones)
148189
end = random.choice(zones)
149190
trip_distance = distance(start, end)
191+
record["dropoff_datetime"] = format_ts(current + duration(trip_distance))
150192

151193
record["pickup_zone"] = start
152194
record["dropoff_zone"] = end

0 commit comments

Comments
 (0)