124 lines
3.9 KiB
Python
124 lines
3.9 KiB
Python
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
from __future__ import unicode_literals
|
|
|
|
import argparse
|
|
import numpy as np
|
|
import os
|
|
import pandas as pd
|
|
|
|
|
|
def generate_graph_seq2seq_io_data(
|
|
df, x_offsets, y_offsets, add_time_in_day=True, add_day_in_week=False, scaler=None
|
|
):
|
|
"""
|
|
Generate samples from
|
|
:param df:
|
|
:param x_offsets:
|
|
:param y_offsets:
|
|
:param add_time_in_day:
|
|
:param add_day_in_week:
|
|
:param scaler:
|
|
:return:
|
|
# x: (epoch_size, input_length, num_nodes, input_dim)
|
|
# y: (epoch_size, output_length, num_nodes, output_dim)
|
|
"""
|
|
|
|
num_samples, num_nodes = df.shape
|
|
data = np.expand_dims(df.values, axis=-1)
|
|
data_list = [data]
|
|
if add_time_in_day:
|
|
time_ind = (df.index.values - df.index.values.astype("datetime64[D]")) / np.timedelta64(1, "D")
|
|
time_in_day = np.tile(time_ind, [1, num_nodes, 1]).transpose((2, 1, 0))
|
|
data_list.append(time_in_day)
|
|
if add_day_in_week:
|
|
day_in_week = np.zeros(shape=(num_samples, num_nodes, 7))
|
|
day_in_week[np.arange(num_samples), :, df.index.dayofweek] = 1
|
|
data_list.append(day_in_week)
|
|
|
|
data = np.concatenate(data_list, axis=-1)
|
|
# epoch_len = num_samples + min(x_offsets) - max(y_offsets)
|
|
x, y = [], []
|
|
# t is the index of the last observation.
|
|
min_t = abs(min(x_offsets))
|
|
max_t = abs(num_samples - abs(max(y_offsets))) # Exclusive
|
|
for t in range(min_t, max_t):
|
|
x_t = data[t + x_offsets, ...]
|
|
y_t = data[t + y_offsets, ...]
|
|
x.append(x_t)
|
|
y.append(y_t)
|
|
x = np.stack(x, axis=0)
|
|
y = np.stack(y, axis=0)
|
|
return x, y
|
|
|
|
|
|
def generate_train_val_test(args):
|
|
df = pd.read_hdf(args.traffic_df_filename)
|
|
# 0 is the latest observed sample.
|
|
x_offsets = np.sort(
|
|
# np.concatenate(([-week_size + 1, -day_size + 1], np.arange(-11, 1, 1)))
|
|
np.concatenate((np.arange(-11, 1, 1),))
|
|
)
|
|
# Predict the next one hour
|
|
y_offsets = np.sort(np.arange(1, 13, 1))
|
|
# x: (num_samples, input_length, num_nodes, input_dim)
|
|
# y: (num_samples, output_length, num_nodes, output_dim)
|
|
x, y = generate_graph_seq2seq_io_data(
|
|
df,
|
|
x_offsets=x_offsets,
|
|
y_offsets=y_offsets,
|
|
add_time_in_day=True,
|
|
add_day_in_week=False,
|
|
)
|
|
|
|
print("x shape: ", x.shape, ", y shape: ", y.shape)
|
|
# Write the data into npz file.
|
|
# num_test = 6831, using the last 6831 examples as testing.
|
|
# for the rest: 7/8 is used for training, and 1/8 is used for validation.
|
|
num_samples = x.shape[0]
|
|
num_test = round(num_samples * 0.2)
|
|
num_train = round(num_samples * 0.7)
|
|
num_val = num_samples - num_test - num_train
|
|
|
|
# train
|
|
x_train, y_train = x[:num_train], y[:num_train]
|
|
# val
|
|
x_val, y_val = (
|
|
x[num_train: num_train + num_val],
|
|
y[num_train: num_train + num_val],
|
|
)
|
|
# test
|
|
x_test, y_test = x[-num_test:], y[-num_test:]
|
|
|
|
for cat in ["train", "val", "test"]:
|
|
_x, _y = locals()["x_" + cat], locals()["y_" + cat]
|
|
print(cat, "x: ", _x.shape, "y:", _y.shape)
|
|
np.savez_compressed(
|
|
os.path.join(args.output_dir, "%s.npz" % cat),
|
|
x=_x,
|
|
y=_y,
|
|
x_offsets=x_offsets.reshape(list(x_offsets.shape) + [1]),
|
|
y_offsets=y_offsets.reshape(list(y_offsets.shape) + [1]),
|
|
)
|
|
|
|
|
|
def main(args):
|
|
print("Generating training data")
|
|
generate_train_val_test(args)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument(
|
|
"--output_dir", type=str, default="data/", help="Output directory."
|
|
)
|
|
parser.add_argument(
|
|
"--traffic_df_filename",
|
|
type=str,
|
|
default="data/metr-la.h5",
|
|
help="Raw traffic readings.",
|
|
)
|
|
args = parser.parse_args()
|
|
main(args)
|