import json import os import shutil import numpy as np import pandas as pd # Hyperparameters dataset_name = 'METR-LA' data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5' graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl' output_dir = f'datasets/{dataset_name}' target_channel = [0] # Target traffic flow channel add_time_of_day = True # Add time of day as a feature add_day_of_week = True # Add day of the week as a feature add_day_of_month = False # Add day of the month as a feature add_day_of_year = False # Add day of the year as a feature steps_per_day = 288 # Number of time steps per day frequency = 1440 // steps_per_day domain = 'traffic speed' feature_description = [domain, 'time of day', 'day of week'] regular_settings = { 'INPUT_LEN': 12, 'OUTPUT_LEN': 12, 'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2], 'NORM_EACH_CHANNEL': False, 'RESCALE': True, 'METRICS': ['MAE', 'RMSE', 'MAPE'], 'NULL_VAL': 0.0 } def load_and_preprocess_data(): '''Load and preprocess raw data, selecting the specified channel(s).''' df = pd.read_hdf(data_file_path) data = np.expand_dims(df.values, axis=-1) data = data[..., target_channel] print(f'Raw time series shape: {data.shape}') return data, df def add_temporal_features(data, df): '''Add time of day and day of week as features to the data.''' _, n, _ = data.shape feature_list = [data] if add_time_of_day: time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D') time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0)) feature_list.append(time_of_day_tiled) if add_day_of_week: day_of_week = df.index.dayofweek / 7 day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0)) feature_list.append(day_of_week_tiled) if add_day_of_month: # numerical day_of_month day_of_month = (df.index.day - 1 ) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0. day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0)) feature_list.append(day_of_month_tiled) if add_day_of_year: # numerical day_of_year day_of_year = (df.index.dayofyear - 1) / 366 # df.index.month starts from 1. We need to minus 1 to make it start from 0. day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0)) feature_list.append(day_of_year_tiled) data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C return data_with_features def save_data(data): '''Save the preprocessed data to a binary file.''' if not os.path.exists(output_dir): os.makedirs(output_dir) file_path = os.path.join(output_dir, 'data.dat') fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape) fp[:] = data[:] fp.flush() del fp print(f'Data saved to {file_path}') def save_graph(): '''Save the adjacency matrix to the output directory.''' output_graph_path = os.path.join(output_dir, 'adj_mx.pkl') shutil.copyfile(graph_file_path, output_graph_path) print(f'Adjacency matrix saved to {output_graph_path}') def save_description(data): '''Save a description of the dataset to a JSON file.''' description = { 'name': dataset_name, 'domain': domain, 'shape': data.shape, 'num_time_steps': data.shape[0], 'num_nodes': data.shape[1], 'num_features': data.shape[2], 'feature_description': feature_description, 'has_graph': graph_file_path is not None, 'frequency (minutes)': frequency, 'regular_settings': regular_settings } description_path = os.path.join(output_dir, 'desc.json') with open(description_path, 'w') as f: json.dump(description, f, indent=4) print(f'Description saved to {description_path}') print(description) def main(): # Load and preprocess data data, df = load_and_preprocess_data() # Add temporal features data_with_features = add_temporal_features(data, df) # Save processed data save_data(data_with_features) # Copy and save adjacency matrix save_graph() # Save dataset description save_description(data_with_features) if __name__ == '__main__': main()