chore: initialize repo, add .gitignore, data scripts

This commit is contained in:
Auto Commit 2025-09-17 14:00:51 +00:00
parent 4c279cc747
commit 8372a7580c
48 changed files with 5789 additions and 75 deletions

66
.gitignore vendored Normal file
View File

@ -0,0 +1,66 @@
# Project artifacts
GPT-2/
datasets/
checkpoints/
log/
# Python
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
.venv/
venv/
env/
.env
# Data & weights
*.npz
*.npy
*.pkl
*.pt
*.pth
*.ckpt
*.bin
# Logs
*.log
# VSCode
.vscode/
# JetBrains / IDEs
.idea/
.fleet/
.vs/
*.code-workspace
# OS files
.DS_Store
Thumbs.db
# Python build & test
build/
dist/
*.egg-info/
pip-wheel-metadata/
.pytest_cache/
.mypy_cache/
.ruff_cache/
.tox/
.nox/
.coverage
coverage.xml
htmlcov/
.cache/
# Editors swap/history
*.swp
*.swo
.history/
# Jupyter
.ipynb_checkpoints/

View File

@ -1,77 +1,31 @@
<div align="center">
<!-- <h1><b> Time-LLM </b></h1> -->
<!-- <h2><b> Time-LLM </b></h2> -->
<h2><b> (IJCAI'25) RePST: Language Model Empowered Spatio-Temporal Forecasting via Semantic-Oriented Reprogramming </b></h2>
</div>
RePST (fixed version)
Prepare the GPT-2 pretrained weights:
```bash
mkdir GPT-2
wget "https://huggingface.co/openai-community/gpt2/resolve/main/config.json?download=true" -O ./GPT-2/config.json
wget "https://huggingface.co/openai-community/gpt2/resolve/main/pytorch_model.bin?download=true" -O ./GPT-2/pytorch_model.bin
```
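A quick, optional sanity check that the local weights load (a minimal sketch; assumes `transformers` and `torch` are installed, and the parameter count is approximate):
```python
from transformers import GPT2Model

# Load from the local ./GPT-2 folder (config.json + pytorch_model.bin).
gpt2 = GPT2Model.from_pretrained('./GPT-2')
print(sum(p.numel() for p in gpt2.parameters()))  # roughly 124M parameters for GPT-2 small
```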
Prepare the PEMS-BAY dataset following the [BasicTS](https://github.com/GestaltCogTeam/BasicTS/blob/master/tutorial/dataset_design.md) dataset design guide.
The prepared data is available on [Google Drive](https://drive.google.com/drive/folders/14EJVODCU48fGK0FkyeVom_9lETh80Yjp?usp=sharing) and can be downloaded with gdown.
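For example, one way to fetch the folder with gdown's Python API (a minimal sketch; assumes `pip install gdown` and that the shared folder stays public; if the folder ships an archive, extract it afterwards as described below):
```python
import gdown

# Download the shared PEMS-BAY folder into ./datasets/PEMS-BAY
url = "https://drive.google.com/drive/folders/14EJVODCU48fGK0FkyeVom_9lETh80Yjp"
gdown.download_folder(url, output="datasets/PEMS-BAY", quiet=False)
```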
After extracting, make sure the `./datasets/PEMS-BAY` folder contains `adj_mx.pkl`, `data.dat`, and `desc.json`, then run the script:
```bash
python prepare_pems_bay.py
```
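A hedged sanity check of the generated splits (file names come from the script above; shapes depend on your `--seq_len`/`--pred_len`):
```python
import numpy as np

for split in ("train", "val", "test"):
    d = np.load(f"datasets/PEMS-BAY/{split}.npz")
    # x/y are stored as (samples, nodes, length, channels), matching the model's b n l m layout
    print(split, d["x"].shape, d["y"].shape)
```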
---
>
> 🙋 Please let us know if you find a mistake or have any suggestions!
>
> 🐝 The full version of this paper can be accessed at https://arxiv.org/abs/2408.14505.
>
> 🌟 If you find this resource helpful, please consider starring this repository and citing our research:
This generates `train.npz`, `val.npz`, and `test.npz` under the PEMS-BAY dataset folder.
```
@inproceedings{wang2025repst,
title={RePST: Language Model Empowered Spatio-Temporal Forecasting via Semantic-Oriented Reprogramming},
author={Wang, Hao and Han, Jindong and Fan, Wei and Sun, Leilei and Liu, Hao},
booktitle={Proceedings of the 34th International Joint Conference on Artificial Intelligence},
year={2025}
}
```
Set up the BasicTS environment as described in the BasicTS repository, or use
`pip install -r requirement.txt`
## Introduction
This repository contains the implementation of REPST, a framework for spatio-temporal forecasting that leverages the reasoning and generalization capabilities of Pre-trained Language Models (PLMs). REPST utilizes a semantic-aware spatio-temporal decomposer and selective discrete reprogramming to enable PLMs to handle complex spatio-temporal data, especially in data-scarce environments.
I used an existing BasicTS environment directly, so this setup has not been tested independently.
<p align="center">
<img src="./figures/repst.png" height = "360" alt="" align=center />
</p>
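For orientation only, a minimal sketch of cross-attention reprogramming in the Time-LLM style, where patch embeddings attend over a small set of text prototypes distilled from the PLM's word-embedding table. RePST's selective discrete reprogramming differs in how the vocabulary is built and selected, and the dimensions (`d_model`, `d_llm`, `vocab_size`, `n_prototypes`) below are illustrative assumptions, not the repository's values:
```python
import torch
import torch.nn as nn

class ReprogrammingSketch(nn.Module):
    """Illustrative only: not the repository's reprogramming module."""
    def __init__(self, d_model=64, d_llm=768, vocab_size=50257, n_prototypes=1000):
        super().__init__()
        self.prototype_map = nn.Linear(vocab_size, n_prototypes)  # distil the word-embedding table
        self.cross_attn = nn.MultiheadAttention(d_model, num_heads=4,
                                                kdim=d_llm, vdim=d_llm, batch_first=True)

    def forward(self, patch_emb, word_emb):
        # patch_emb: (B, P, d_model); word_emb: (vocab_size, d_llm), e.g. gpt2.wte.weight
        prototypes = self.prototype_map(word_emb.T).T                # (n_prototypes, d_llm)
        prototypes = prototypes.unsqueeze(0).expand(patch_emb.size(0), -1, -1)
        out, _ = self.cross_attn(patch_emb, prototypes, prototypes)  # (B, P, d_model)
        return out
```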
- RePST comprises two key components: (1) a dynamic mode decomposition approach that disentangles spatially correlated time series into interpretable components, and (2) an expanded spatio-temporal vocabulary that helps PLMs better understand the dynamics of complex spatio-temporal systems, to guide PLM reasoning.
<p align="center">
<img src="./figures/method-detailed-illustration.png" height = "190" alt="" align=center />
</p>
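For intuition about component (1), here is a minimal sketch of exact dynamic mode decomposition applied to a `(T, N)` series. It is illustrative only and is not the repository's semantic-aware spatio-temporal decomposer:
```python
import numpy as np

def dmd_modes(series: np.ndarray, rank: int = 8):
    """Exact-DMD sketch: series has shape (T, N); returns eigenvalues and spatial modes."""
    X, Y = series[:-1].T, series[1:].T                         # snapshot pairs, shape (N, T-1)
    U, s, Vh = np.linalg.svd(X, full_matrices=False)
    r = min(rank, int(np.sum(s > 1e-10)))                      # truncate to a stable rank
    U, s, Vh = U[:, :r], s[:r], Vh[:r]
    A_tilde = U.conj().T @ Y @ Vh.conj().T @ np.diag(1.0 / s)  # reduced linear operator
    eigvals, W = np.linalg.eig(A_tilde)                        # temporal dynamics per mode
    modes = Y @ Vh.conj().T @ np.diag(1.0 / s) @ W             # exact DMD spatial modes
    return eigvals, modes

# e.g. eigvals, modes = dmd_modes(np.random.randn(200, 325))  # 325 sensors, 200 steps (toy data)
```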
## Requirements
Use Python 3.11 from Miniconda.
- torch==2.0.1
- accelerate==0.28.0
- einops==0.6.0
- matplotlib==3.7.0
- numpy==1.24.4
- pandas==2.1.4
- scikit_learn==1.3.2
- scipy==1.11.4
- tqdm==4.66.1
- transformers==4.36.2
To install all dependencies:
```
pip install -r requirements.txt
```
## Datasets
# Pending
You can access the pre-processed datasets from [[Google Drive]](https://drive.google.com/), then place the downloaded contents under `./dataset`.
## Detailed usage
Please refer to ```run.py``` for the detailed description of each hyperparameter.
## Acknowledgement
Our baseline model implementation adopts [BasicTS](https://github.com/GestaltCogTeam/BasicTS) as the code base, and we have extensively modified it for our purposes. We thank the authors for sharing their implementations and related resources.
Run training:
```bash
python run.py --root_path datasets --data_path PEMS-BAY --device cuda:0 --seq_len 12 --pred_len 12
```

View File

@ -2,6 +2,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from math import sqrt
class ReplicationPad1d(nn.Module):
@ -20,7 +21,7 @@ class TokenEmbedding(nn.Module):
padding = 1
self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model,
kernel_size=3, padding=padding, padding_mode='circular', bias=False)
self.confusion_layer = nn.Linear(12, 1)
self.confusion_layer = nn.LazyLinear(1)
# if air_quality
# self.confusion_layer = nn.Linear(42, 1)
@ -31,8 +32,8 @@ class TokenEmbedding(nn.Module):
m.weight, mode='fan_in', nonlinearity='leaky_relu')
def forward(self, x):
b, n, m, pn, pl = x.shape
x = self.tokenConv(x.reshape(b*n, pl, m*pn))
b, n, m, pn, pl = x.shape # batch, node, feature, patch_num, patch_len
x = self.tokenConv(x.reshape(b*n, pl, m*pn)) # batch*node, patch_len, feature*patch_num
x = self.confusion_layer(x)
return x.reshape(b, n, -1)
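Note on the `confusion_layer` change above: `nn.LazyLinear(1)` infers `in_features` on the first forward pass, so the layer no longer has to be hard-coded to 12 (PEMS-BAY) or 42 (air quality); once materialized, the size is fixed for that model instance. A standalone illustration in plain PyTorch (not the repository's module):
```python
import torch
import torch.nn as nn

layer = nn.LazyLinear(out_features=1)   # in_features unknown until the first call
out = layer(torch.randn(8, 12))         # first forward materializes in_features = 12
print(layer.weight.shape)               # torch.Size([1, 12])
```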

View File

@ -7,8 +7,8 @@ from transformers.models.gpt2.modeling_gpt2 import GPT2Model
from transformers import GPT2Model, GPT2Config
from einops import rearrange
from reprogramming import *
from normalizer import *
from .reprogramming import *
from .normalizer import *
class repst(nn.Module):
@ -34,9 +34,6 @@ class repst(nn.Module):
self.patch_embedding = PatchEmbedding(self.d_model, self.patch_len, self.stride, self.dropout)
self.gpts = GPT2Model.from_pretrained('./GPT-2', output_attentions=True, output_hidden_states=True)
self.gpts.h = self.gpts.h[:self.gpt_layers]

115
prepare_pems_bay.py Normal file
View File

@ -0,0 +1,115 @@
import os
import json
import argparse
import numpy as np
def generate_offsets(seq_length_x: int, seq_length_y: int):
x_offsets = np.sort(np.concatenate((np.arange(-(seq_length_x - 1), 1, 1),)))
y_offsets = np.sort(np.arange(1, seq_length_y + 1, 1))
return x_offsets, y_offsets
def make_sliding_windows(data: np.ndarray, x_offsets: np.ndarray, y_offsets: np.ndarray):
# data: (T, N, C)
num_samples = data.shape[0]
min_t = abs(int(np.min(x_offsets)))
max_t = num_samples - int(np.max(y_offsets))
x, y = [], []
for t in range(min_t, max_t):
x.append(data[t + x_offsets, ...]) # (seq_len, N, C)
y.append(data[t + y_offsets, ...]) # (pred_len, N, C)
x = np.stack(x, axis=0).astype(np.float32) # (S, seq_len, N, C)
y = np.stack(y, axis=0).astype(np.float32) # (S, pred_len, N, C)
# Reorder to (S, N, L, C) to match model expectation: b n l m
x = np.transpose(x, (0, 2, 1, 3))
y = np.transpose(y, (0, 2, 1, 3))
return x, y
def split_by_ratio(x: np.ndarray, y: np.ndarray, ratios):
r_train, r_val, r_test = ratios
num_samples = x.shape[0]
n_train = int(round(num_samples * r_train))
n_val = int(round(num_samples * r_val))
n_test = num_samples - n_train - n_val
x_train, y_train = x[:n_train], y[:n_train]
x_val, y_val = x[n_train:n_train + n_val], y[n_train:n_train + n_val]
x_test, y_test = x[-n_test:], y[-n_test:]
return (x_train, y_train), (x_val, y_val), (x_test, y_test)
def main():
parser = argparse.ArgumentParser(description="Prepare PEMS-BAY to train/val/test .npz")
parser.add_argument("--dataset_dir", type=str, default='./datasets/PEMS-BAY', help="Path to datasets/PEMS-BAY directory")
parser.add_argument("--seq_len", type=int, default=12)
parser.add_argument("--pred_len", type=int, default=12)
parser.add_argument("--speed_channel_only", action="store_true", help="Use only the first channel (speed)")
args = parser.parse_args()
dataset_dir = args.dataset_dir
desc_path = os.path.join(dataset_dir, "desc.json")
data_path = os.path.join(dataset_dir, "data.dat")
if not os.path.exists(desc_path):
raise FileNotFoundError(f"desc.json not found at {desc_path}")
if not os.path.exists(data_path):
raise FileNotFoundError(f"data.dat not found at {data_path}")
with open(desc_path, "r") as f:
desc = json.load(f)
shape = desc.get("shape") # expected [T, N, C]
if not shape or len(shape) not in (2, 3):
raise ValueError(f"Invalid shape in desc.json: {shape}")
total_elems = int(np.prod(shape))
raw = np.fromfile(data_path, dtype=np.float32)
if raw.size != total_elems:
# Try infer last dim as 1 if desc has 2 dims
if len(shape) == 2 and raw.size == shape[0] * shape[1]:
pass
else:
raise ValueError(f"data.dat size mismatch. desc={shape}, fromfile={raw.size}")
if len(shape) == 3:
data = raw.reshape(shape)
else:
data = raw.reshape(shape + [1]) # (T, N, 1)
# Use only speed channel for this model (expects C=1)
if data.shape[-1] > 1:
data = data[..., :1]
x_offsets, y_offsets = generate_offsets(args.seq_len, args.pred_len)
x, y = make_sliding_windows(data, x_offsets, y_offsets)
ratios = desc.get("regular_settings", {}).get("TRAIN_VAL_TEST_RATIO", [0.7, 0.1, 0.2])
(x_train, y_train), (x_val, y_val), (x_test, y_test) = split_by_ratio(x, y, ratios)
for split_name, _x, _y in (
("train", x_train, y_train),
("val", x_val, y_val),
("test", x_test, y_test),
):
out_path = os.path.join(dataset_dir, f"{split_name}.npz")
np.savez_compressed(
out_path,
x=_x,
y=_y,
x_offsets=x_offsets.reshape(list(x_offsets.shape) + [1]),
y_offsets=y_offsets.reshape(list(y_offsets.shape) + [1]),
)
print(f"Saved {split_name} -> {out_path} | x={_x.shape}, y={_y.shape}")
print("Done.")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,45 @@
import os
from argparse import ArgumentParser, BooleanOptionalAction
import numpy as np
from tqdm import tqdm
data_dir_path = 'datasets/BLAST/train'
def main(clean_cache=False):
num_samples = 0
for i in range(99):
shape = tuple(np.load(data_dir_path + f'/shape_{i}_99.npy'))
N, L = shape
num_samples += N
merged_data = np.memmap(data_dir_path + '/data.dat', mode='w+', dtype=np.float32, shape=(num_samples, L))
print('Merging data...')
current_index = 0
for i in tqdm(range(99)):
shape = tuple(np.load(data_dir_path + f'/shape_{i}_99.npy'))
data = np.memmap(data_dir_path + f'/data_{i}_99.dat', mode='r', dtype=np.float32, shape=shape)
merged_data[current_index:current_index + shape[0]] = data
current_index += shape[0]
shape = merged_data.shape
np.save(data_dir_path + '/shape.npy', shape)
print('Data merged successfully.')
if clean_cache:
print('Cleaning cache...')
for i in tqdm(range(99)):
os.remove(data_dir_path + f'/data_{i}_99.dat')
os.remove(data_dir_path + f'/shape_{i}_99.npy')
print('Cache cleaned.')
def parse_args():
parser = ArgumentParser(description='Merge data files into a single memmap file.')
parser.add_argument('--clean_cache', action=BooleanOptionalAction, default=True, help='Clean cache after merging (disable with --no-clean_cache).')
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
main(clean_cache=args.clean_cache)

View File

@ -0,0 +1,103 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'BeijingAirQuality'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.xlsx'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'Beijing air quality'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_excel(data_file_path)
data = df.values
columns = df.columns
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
print('Columns: {0}'.format(columns))
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,136 @@
import json
import os
import pickle
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'CA'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.npy'
meta_file_path = f'datasets/raw_data/{dataset_name}/meta_{dataset_name}.csv'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1 ) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
adj_mx = np.load(graph_file_path)
with open(output_dir + '/adj_mx.pkl', 'wb') as f:
pickle.dump(adj_mx, f)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_meta_data():
'''Save the meta data to the output directory'''
output_meta_data_path = os.path.join(output_dir, 'meta.csv')
shutil.copyfile(meta_file_path, output_meta_data_path)
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Copy and save meta data
save_meta_data()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'ETTh1'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'electricity transformer temperature'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df = df.iloc[:20*30*24]
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings,
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'ETTh2'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'electricity transformer temperature'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df = df.iloc[:20*30*24]
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'ETTm1'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'electricity transformer temperature'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df = df.iloc[:20*30*24*4]
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'ETTm2'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'electricity transformer temperature'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df = df.iloc[:20*30*24*4]
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,120 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'Electricity'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'electricity consumption'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,120 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'ExchangeRate'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 1 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'exchange rate'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y/%m/%d %H:%M').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,136 @@
import json
import os
import pickle
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'GBA'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.npy'
meta_file_path = f'datasets/raw_data/{dataset_name}/meta_{dataset_name}.csv'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1 ) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
adj_mx = np.load(graph_file_path)
with open(output_dir + '/adj_mx.pkl', 'wb') as f:
pickle.dump(adj_mx, f)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_meta_data():
'''Save the meta data to the output directory'''
output_meta_data_path = os.path.join(output_dir, 'meta.csv')
shutil.copyfile(meta_file_path, output_meta_data_path)
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Copy and save meta data
save_meta_data()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,136 @@
import json
import os
import pickle
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'GLA'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.npy'
meta_file_path = f'datasets/raw_data/{dataset_name}/meta_{dataset_name}.csv'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1 ) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
adj_mx = np.load(graph_file_path)
with open(output_dir + '/adj_mx.pkl', 'wb') as f:
pickle.dump(adj_mx, f)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_meta_data():
'''Save the meta data to the output directory'''
output_meta_data_path = os.path.join(output_dir, 'meta.csv')
shutil.copyfile(meta_file_path, output_meta_data_path)
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Copy and save meta data
save_meta_data()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,73 @@
import json
import os
import numpy as np
# Hyperparameters
dataset_name = 'Gaussian'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npy'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
frequency = None
domain = 'simulated Gaussian data'
feature_description = [domain]
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Save processed data
save_data(data)
# Save dataset description
save_description(data)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,26 @@
import os
import numpy as np
import torch
PROJECT_DIR = os.path.abspath(__file__ + '/../../../..')
os.chdir(PROJECT_DIR)
# hyperparameters
duration = 10000 # time series length
def generate_gaussian_noise_sequence():
x = np.arange(0, duration, 1)
y = np.random.normal(0, 1, duration)
return x, y
# generate gaussian sequence
time_points, gaussian_noise_sequence = generate_gaussian_noise_sequence()
# save the gaussian noise sequence
data = torch.Tensor(gaussian_noise_sequence).unsqueeze(-1).unsqueeze(-1).numpy()
# mkdir datasets/raw_data/Gaussian
if not os.path.exists('datasets/raw_data/Gaussian'):
os.makedirs('datasets/raw_data/Gaussian')
np.save('datasets/raw_data/Gaussian/Gaussian.npy', data)

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'GlobalTemp'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day # minutes
domain = 'global temperature'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 48,
'OUTPUT_LEN': 24,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y/%m/%d %H:%M').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
# data = data / 10
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'GlobalWind'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day # minutes
domain = 'global wind'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 48,
'OUTPUT_LEN': 24,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
data = data / 10
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day, day of week, day of month, and day of year as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,120 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'Illness'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 1/7 # Weekly data: one time step every 7 days
frequency = 1440 // steps_per_day # minutes between observations (weekly)
domain = 'illness data'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 96,
'OUTPUT_LEN': 48,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day, day of week, day of month, and day of year as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = (
df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
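# fraction of the day elapsed since midnight, in [0, 1): e.g. 06:00 -> 0.25 (assumes df has a DatetimeIndex)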
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,84 @@
import csv
import os
import pickle
import numpy as np
def get_adjacency_matrix(distance_df_filename: str, num_of_vertices: int, id_filename: str = None) -> tuple:
"""Generate adjacency matrix.
Args:
distance_df_filename (str): path of the CSV file that contains the edge information
num_of_vertices (int): number of vertices
id_filename (str, optional): id filename. Defaults to None.
Returns:
tuple: two adjacency matrices.
np.array: connectivity-based adjacency matrix A (A[i, j]=0 or A[i, j]=1)
np.array: distance-based adjacency matrix A
"""
if "npy" in distance_df_filename:
adj_mx = np.load(distance_df_filename)
return adj_mx, None
else:
adjacency_matrix_connectivity = np.zeros((int(num_of_vertices), int(
num_of_vertices)), dtype=np.float32)
adjacency_matrix_distance = np.zeros((int(num_of_vertices), int(num_of_vertices)),
dtype=np.float32)
if id_filename:
# the id in the distance file does not start from 0, so it needs to be remapped
with open(id_filename, "r") as f:
id_dict = {int(i): idx for idx, i in enumerate(
f.read().strip().split("\n"))} # map node idx to 0-based index (start from 0)
with open(distance_df_filename, "r") as f:
f.readline() # omit the first line
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[id_dict[i], id_dict[j]] = 1
adjacency_matrix_connectivity[id_dict[j], id_dict[i]] = 1
adjacency_matrix_distance[id_dict[i],
id_dict[j]] = distance
adjacency_matrix_distance[id_dict[j],
id_dict[i]] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
else:
# ids in distance file start from 0
with open(distance_df_filename, "r") as f:
f.readline()
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[i, j] = 1
adjacency_matrix_connectivity[j, i] = 1
adjacency_matrix_distance[i, j] = distance
adjacency_matrix_distance[j, i] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
def generate_adj_jinan():
distance_df_filename, num_of_vertices = "datasets/raw_data/JiNan/JiNan.csv", 406
if os.path.exists(distance_df_filename.split(".", maxsplit=1)[0] + ".txt"):
id_filename = distance_df_filename.split(".", maxsplit=1)[0] + ".txt"
else:
id_filename = None
adj_mx, distance_mx = get_adjacency_matrix(
distance_df_filename, num_of_vertices, id_filename=id_filename)
# the self loop is missing
add_self_loop = False
if add_self_loop:
print("adding self loop to adjacency matrices.")
adj_mx = adj_mx + np.identity(adj_mx.shape[0])
distance_mx = distance_mx + np.identity(distance_mx.shape[0])
else:
print("kindly note that there is no self loop in adjacency matrices.")
with open("datasets/raw_data/JiNan/adj_JiNan.pkl", "wb") as f:
pickle.dump(adj_mx, f)
with open("datasets/raw_data/JiNan/adj_JiNan_distance.pkl", "wb") as f:
pickle.dump(distance_mx, f)
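
For reference, the pickled matrices written above can be read back with pickle alone. A small sanity-check sketch (the expected shape and the symmetry/self-loop checks follow from how the matrix is filled above, and assume the CSV branch was taken rather than the .npy branch):

```python
import pickle

import numpy as np

with open('datasets/raw_data/JiNan/adj_JiNan.pkl', 'rb') as f:
    adj_mx = pickle.load(f)

print('shape:', adj_mx.shape)                       # expected (406, 406)
print('symmetric:', np.allclose(adj_mx, adj_mx.T))  # edges are written in both directions
print('self loops:', int(np.trace(adj_mx)))         # 0 unless add_self_loop was enabled
```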

View File

@ -0,0 +1,113 @@
import json
import os
import shutil
import numpy as np
from generate_adj_mx import generate_adj_jinan as generate_adj
# Hyperparameters
dataset_name = 'JiNan'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npz'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)['data']
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory, generating it if necessary.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
if os.path.exists(graph_file_path):
shutil.copyfile(graph_file_path, output_graph_path)
else:
generate_adj()
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Copy or generate and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,124 @@
import json
import os
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'METR-LA'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic speed channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic speed'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,124 @@
import json
import os
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'PEMS-BAY'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic speed channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic speed'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()
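
The train/val/test split itself is not part of this file. A minimal sketch of how the TRAIN_VAL_TEST_RATIO from regular_settings could be applied along the time axis of the saved memmap (the output file names and the contiguous-split strategy are assumptions, not the project's actual split code):

```python
import json
import os

import numpy as np

dataset_dir = 'datasets/PEMS-BAY'

# recover the array shape from desc.json, then map the raw float32 file
with open(os.path.join(dataset_dir, 'desc.json')) as f:
    desc = json.load(f)
data = np.memmap(os.path.join(dataset_dir, 'data.dat'),
                 dtype='float32', mode='r', shape=tuple(desc['shape']))

train_ratio, val_ratio, _ = desc['regular_settings']['TRAIN_VAL_TEST_RATIO']
num_steps = data.shape[0]
train_end = int(num_steps * train_ratio)
val_end = train_end + int(num_steps * val_ratio)

# contiguous split along the time axis; windowing into (INPUT_LEN, OUTPUT_LEN)
# samples would be built on top of these slices
np.savez(os.path.join(dataset_dir, 'train.npz'), data=np.asarray(data[:train_end]))
np.savez(os.path.join(dataset_dir, 'val.npz'), data=np.asarray(data[train_end:val_end]))
np.savez(os.path.join(dataset_dir, 'test.npz'), data=np.asarray(data[val_end:]))
```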

View File

@ -0,0 +1,84 @@
import csv
import os
import pickle
import numpy as np
def get_adjacency_matrix(distance_df_filename: str, num_of_vertices: int, id_filename: str = None) -> tuple:
"""Generate adjacency matrix.
Args:
distance_df_filename (str): path of the CSV file that contains the edge information
num_of_vertices (int): number of vertices
id_filename (str, optional): id filename. Defaults to None.
Returns:
tuple: two adjacency matrices.
np.array: connectivity-based adjacency matrix A (A[i, j]=0 or A[i, j]=1)
np.array: distance-based adjacency matrix A
"""
if "npy" in distance_df_filename:
adj_mx = np.load(distance_df_filename)
return adj_mx, None
else:
adjacency_matrix_connectivity = np.zeros((int(num_of_vertices), int(
num_of_vertices)), dtype=np.float32)
adjacency_matrix_distance = np.zeros((int(num_of_vertices), int(num_of_vertices)),
dtype=np.float32)
if id_filename:
# the id in the distance file does not start from 0, so it needs to be remapped
with open(id_filename, "r") as f:
id_dict = {int(i): idx for idx, i in enumerate(
f.read().strip().split("\n"))} # map node idx to 0-based index (start from 0)
with open(distance_df_filename, "r") as f:
f.readline() # omit the first line
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[id_dict[i], id_dict[j]] = 1
adjacency_matrix_connectivity[id_dict[j], id_dict[i]] = 1
adjacency_matrix_distance[id_dict[i],
id_dict[j]] = distance
adjacency_matrix_distance[id_dict[j],
id_dict[i]] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
else:
# ids in distance file start from 0
with open(distance_df_filename, "r") as f:
f.readline()
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[i, j] = 1
adjacency_matrix_connectivity[j, i] = 1
adjacency_matrix_distance[i, j] = distance
adjacency_matrix_distance[j, i] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
def generate_adj_pems03():
distance_df_filename, num_of_vertices = "datasets/raw_data/PEMS03/PEMS03.csv", 358
if os.path.exists(distance_df_filename.split(".", maxsplit=1)[0] + ".txt"):
id_filename = distance_df_filename.split(".", maxsplit=1)[0] + ".txt"
else:
id_filename = None
adj_mx, distance_mx = get_adjacency_matrix(
distance_df_filename, num_of_vertices, id_filename=id_filename)
# the self loop is missing
add_self_loop = False
if add_self_loop:
print("adding self loop to adjacency matrices.")
adj_mx = adj_mx + np.identity(adj_mx.shape[0])
distance_mx = distance_mx + np.identity(distance_mx.shape[0])
else:
print("kindly note that there is no self loop in adjacency matrices.")
with open("datasets/raw_data/PEMS03/adj_PEMS03.pkl", "wb") as f:
pickle.dump(adj_mx, f)
with open("datasets/raw_data/PEMS03/adj_PEMS03_distance.pkl", "wb") as f:
pickle.dump(distance_mx, f)

View File

@ -0,0 +1,113 @@
import json
import os
import shutil
import numpy as np
from generate_adj_mx import generate_adj_pems03 as generate_adj
# Hyperparameters
dataset_name = 'PEMS03'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npz'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)['data']
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory, generating it if necessary.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
if os.path.exists(graph_file_path):
shutil.copyfile(graph_file_path, output_graph_path)
else:
generate_adj()
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Copy or generate and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,84 @@
import csv
import os
import pickle
import numpy as np
def get_adjacency_matrix(distance_df_filename: str, num_of_vertices: int, id_filename: str = None) -> tuple:
"""Generate adjacency matrix.
Args:
distance_df_filename (str): path of the CSV file that contains the edge information
num_of_vertices (int): number of vertices
id_filename (str, optional): id filename. Defaults to None.
Returns:
tuple: two adjacency matrices.
np.array: connectivity-based adjacency matrix A (A[i, j]=0 or A[i, j]=1)
np.array: distance-based adjacency matrix A
"""
if "npy" in distance_df_filename:
adj_mx = np.load(distance_df_filename)
return adj_mx, None
else:
adjacency_matrix_connectivity = np.zeros((int(num_of_vertices), int(
num_of_vertices)), dtype=np.float32)
adjacency_matrix_distance = np.zeros((int(num_of_vertices), int(num_of_vertices)),
dtype=np.float32)
if id_filename:
# the id in the distance file does not start from 0, so it needs to be remapped
with open(id_filename, "r") as f:
id_dict = {int(i): idx for idx, i in enumerate(
f.read().strip().split("\n"))} # map node idx to 0-based index (start from 0)
with open(distance_df_filename, "r") as f:
f.readline() # omit the first line
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[id_dict[i], id_dict[j]] = 1
adjacency_matrix_connectivity[id_dict[j], id_dict[i]] = 1
adjacency_matrix_distance[id_dict[i],
id_dict[j]] = distance
adjacency_matrix_distance[id_dict[j],
id_dict[i]] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
else:
# ids in distance file start from 0
with open(distance_df_filename, "r") as f:
f.readline()
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[i, j] = 1
adjacency_matrix_connectivity[j, i] = 1
adjacency_matrix_distance[i, j] = distance
adjacency_matrix_distance[j, i] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
def generate_adj_pems04():
distance_df_filename, num_of_vertices = "datasets/raw_data/PEMS04/PEMS04.csv", 307
if os.path.exists(distance_df_filename.split(".", maxsplit=1)[0] + ".txt"):
id_filename = distance_df_filename.split(".", maxsplit=1)[0] + ".txt"
else:
id_filename = None
adj_mx, distance_mx = get_adjacency_matrix(
distance_df_filename, num_of_vertices, id_filename=id_filename)
# the self loop is missing
add_self_loop = False
if add_self_loop:
print("adding self loop to adjacency matrices.")
adj_mx = adj_mx + np.identity(adj_mx.shape[0])
distance_mx = distance_mx + np.identity(distance_mx.shape[0])
else:
print("kindly note that there is no self loop in adjacency matrices.")
with open("datasets/raw_data/PEMS04/adj_PEMS04.pkl", "wb") as f:
pickle.dump(adj_mx, f)
with open("datasets/raw_data/PEMS04/adj_PEMS04_distance.pkl", "wb") as f:
pickle.dump(distance_mx, f)

View File

@ -0,0 +1,113 @@
import json
import os
import shutil
import numpy as np
from generate_adj_mx import generate_adj_pems04 as generate_adj
# Hyperparameters
dataset_name = 'PEMS04'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npz'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)['data']
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory, generating it if necessary.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
if os.path.exists(graph_file_path):
shutil.copyfile(graph_file_path, output_graph_path)
else:
generate_adj()
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Copy or generate and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,84 @@
import csv
import os
import pickle
import numpy as np
def get_adjacency_matrix(distance_df_filename: str, num_of_vertices: int, id_filename: str = None) -> tuple:
"""Generate adjacency matrix.
Args:
distance_df_filename (str): path of the CSV file that contains the edge information
num_of_vertices (int): number of vertices
id_filename (str, optional): id filename. Defaults to None.
Returns:
tuple: two adjacency matrices.
np.array: connectivity-based adjacency matrix A (A[i, j]=0 or A[i, j]=1)
np.array: distance-based adjacency matrix A
"""
if "npy" in distance_df_filename:
adj_mx = np.load(distance_df_filename)
return adj_mx, None
else:
adjacency_matrix_connectivity = np.zeros((int(num_of_vertices), int(
num_of_vertices)), dtype=np.float32)
adjacency_matrix_distance = np.zeros((int(num_of_vertices), int(num_of_vertices)),
dtype=np.float32)
if id_filename:
# the id in the distance file does not start from 0, so it needs to be remapped
with open(id_filename, "r") as f:
id_dict = {int(i): idx for idx, i in enumerate(
f.read().strip().split("\n"))} # map node idx to 0-based index (start from 0)
with open(distance_df_filename, "r") as f:
f.readline() # omit the first line
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[id_dict[i], id_dict[j]] = 1
adjacency_matrix_connectivity[id_dict[j], id_dict[i]] = 1
adjacency_matrix_distance[id_dict[i],
id_dict[j]] = distance
adjacency_matrix_distance[id_dict[j],
id_dict[i]] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
else:
# ids in distance file start from 0
with open(distance_df_filename, "r") as f:
f.readline()
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[i, j] = 1
adjacency_matrix_connectivity[j, i] = 1
adjacency_matrix_distance[i, j] = distance
adjacency_matrix_distance[j, i] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
def generate_adj_pems07():
distance_df_filename, num_of_vertices = "datasets/raw_data/PEMS07/PEMS07.csv", 883
if os.path.exists(distance_df_filename.split(".", maxsplit=1)[0] + ".txt"):
id_filename = distance_df_filename.split(".", maxsplit=1)[0] + ".txt"
else:
id_filename = None
adj_mx, distance_mx = get_adjacency_matrix(
distance_df_filename, num_of_vertices, id_filename=id_filename)
# the self loop is missing
add_self_loop = False
if add_self_loop:
print("adding self loop to adjacency matrices.")
adj_mx = adj_mx + np.identity(adj_mx.shape[0])
distance_mx = distance_mx + np.identity(distance_mx.shape[0])
else:
print("kindly note that there is no self loop in adjacency matrices.")
with open("datasets/raw_data/PEMS07/adj_PEMS07.pkl", "wb") as f:
pickle.dump(adj_mx, f)
with open("datasets/raw_data/PEMS07/adj_PEMS07_distance.pkl", "wb") as f:
pickle.dump(distance_mx, f)

View File

@ -0,0 +1,113 @@
import json
import os
import shutil
import numpy as np
from generate_adj_mx import generate_adj_pems07 as generate_adj
# Hyperparameters
dataset_name = 'PEMS07'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npz'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)['data']
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory, generating it if necessary.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
if os.path.exists(graph_file_path):
shutil.copyfile(graph_file_path, output_graph_path)
else:
generate_adj()
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Copy or generate and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,84 @@
import csv
import os
import pickle
import numpy as np
def get_adjacency_matrix(distance_df_filename: str, num_of_vertices: int, id_filename: str = None) -> tuple:
"""Generate adjacency matrix.
Args:
distance_df_filename (str): path of the CSV file that contains the edge information
num_of_vertices (int): number of vertices
id_filename (str, optional): id filename. Defaults to None.
Returns:
tuple: two adjacency matrices.
np.array: connectivity-based adjacency matrix A (A[i, j]=0 or A[i, j]=1)
np.array: distance-based adjacency matrix A
"""
if "npy" in distance_df_filename:
adj_mx = np.load(distance_df_filename)
return adj_mx, None
else:
adjacency_matrix_connectivity = np.zeros((int(num_of_vertices), int(
num_of_vertices)), dtype=np.float32)
adjacency_matrix_distance = np.zeros((int(num_of_vertices), int(num_of_vertices)),
dtype=np.float32)
if id_filename:
# the id in the distance file does not start from 0, so it needs to be remapped
with open(id_filename, "r") as f:
id_dict = {int(i): idx for idx, i in enumerate(
f.read().strip().split("\n"))} # map node idx to 0-based index (start from 0)
with open(distance_df_filename, "r") as f:
f.readline() # omit the first line
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[id_dict[i], id_dict[j]] = 1
adjacency_matrix_connectivity[id_dict[j], id_dict[i]] = 1
adjacency_matrix_distance[id_dict[i],
id_dict[j]] = distance
adjacency_matrix_distance[id_dict[j],
id_dict[i]] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
else:
# ids in distance file start from 0
with open(distance_df_filename, "r") as f:
f.readline()
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[i, j] = 1
adjacency_matrix_connectivity[j, i] = 1
adjacency_matrix_distance[i, j] = distance
adjacency_matrix_distance[j, i] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
def generate_adj_pems08():
distance_df_filename, num_of_vertices = "datasets/raw_data/PEMS08/PEMS08.csv", 170
if os.path.exists(distance_df_filename.split(".", maxsplit=1)[0] + ".txt"):
id_filename = distance_df_filename.split(".", maxsplit=1)[0] + ".txt"
else:
id_filename = None
adj_mx, distance_mx = get_adjacency_matrix(
distance_df_filename, num_of_vertices, id_filename=id_filename)
# the self loop is missing
add_self_loop = False
if add_self_loop:
print("adding self loop to adjacency matrices.")
adj_mx = adj_mx + np.identity(adj_mx.shape[0])
distance_mx = distance_mx + np.identity(distance_mx.shape[0])
else:
print("kindly note that there is no self loop in adjacency matrices.")
with open("datasets/raw_data/PEMS08/adj_PEMS08.pkl", "wb") as f:
pickle.dump(adj_mx, f)
with open("datasets/raw_data/PEMS08/adj_PEMS08_distance.pkl", "wb") as f:
pickle.dump(distance_mx, f)

View File

@ -0,0 +1,113 @@
import json
import os
import shutil
import numpy as np
from generate_adj_mx import generate_adj_pems08 as generate_adj
# Hyperparameters
dataset_name = 'PEMS08'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npz'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)['data']
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory, generating it if necessary.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
if os.path.exists(graph_file_path):
shutil.copyfile(graph_file_path, output_graph_path)
else:
generate_adj()
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Copy or generate and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,74 @@
import json
import os
import numpy as np
# Hyperparameters
dataset_name = 'Pulse'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npy'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target channel
frequency = None
domain = 'simulated pulse data'
feature_description = [domain]
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Save processed data
save_data(data)
# Save dataset description
save_description(data)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,33 @@
import os
import numpy as np
import torch
PROJECT_DIR = os.path.abspath(__file__ + '/../../../..')
os.chdir(PROJECT_DIR)
# Hyperparameters
duration = 20000 # time series length
min_interval = 30 # minimum interval between two pulses
max_interval = 30 # maximum interval between two pulses
def generate_pulse_sequence():
x = np.arange(0, duration, 1)
y = np.zeros_like(x)
current_time = 0
while current_time < duration:
pulse_interval = np.random.uniform(min_interval, max_interval)
pulse_width = 1
y[int(current_time):int(current_time + pulse_width)] = 1
current_time += pulse_interval + pulse_width
return x, y
# generate pulse sequence
time_points, pulse_sequence = generate_pulse_sequence()
# save pulse sequence
data = torch.Tensor(pulse_sequence).unsqueeze(-1).unsqueeze(-1).numpy()
np.save('datasets/raw_data/Pulse/Pulse.npy', data)
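
A quick sanity check of the generated file, assuming the script above was run from the project root (the expected values follow from the hyperparameters above):

```python
import numpy as np

pulse = np.load('datasets/raw_data/Pulse/Pulse.npy')
print(pulse.shape)       # expected (20000, 1, 1): L x N x C with a single node and channel
print(pulse.sum())       # number of pulse steps, roughly duration / (min_interval + pulse_width)
print(np.unique(pulse))  # values are 0.0 and 1.0
```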

View File

@ -0,0 +1,136 @@
import json
import os
import pickle
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'SD'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.npy'
meta_file_path = f'datasets/raw_data/{dataset_name}/meta_{dataset_name}.csv'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
adj_mx = np.load(graph_file_path)
with open(output_graph_path, 'wb') as f:
pickle.dump(adj_mx, f)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_meta_data():
'''Save the meta data to the output directory'''
output_meta_data_path = os.path.join(output_dir, 'meta.csv')
shutil.copyfile(meta_file_path, output_meta_data_path)
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Copy and save meta data
save_meta_data()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,120 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'Traffic'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'road occupancy rates'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day, day of week, day of month, and day of year as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = (
df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()
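The saved data.dat is a raw float32 memmap whose shape is recorded only in desc.json, so the two files have to be read back together. A minimal read-back sketch (the `datasets/Traffic` path is just an example; substitute whatever folder this script actually wrote):

```python
import json
import numpy as np

dataset_dir = 'datasets/Traffic'  # hypothetical output folder written by the script above
with open(f'{dataset_dir}/desc.json') as f:
    desc = json.load(f)

# mode='r' maps the file read-only instead of loading it all into memory
data = np.memmap(f'{dataset_dir}/data.dat', dtype='float32', mode='r',
                 shape=tuple(desc['shape']))
print(data.shape)  # (L, N, C): time steps x nodes x features
```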

View File

@ -0,0 +1,120 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'Weather'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target channel to forecast (index into the raw feature columns)
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 144 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'weather'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day, day of week, day of month, and day of year as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()
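As a quick sanity check on the feature layout, the sketch below calls add_temporal_features from the script above on a toy 48-step, 3-node series (random values; only the shapes and the appended channels matter, and the toy hourly index does not match the Weather dataset's 10-minute steps):

```python
import numpy as np
import pandas as pd

toy_index = pd.date_range('2020-01-01', periods=48, freq='H')  # purely illustrative index
toy_df = pd.DataFrame(np.random.rand(48, 3), index=toy_index)  # L x N raw values
toy_data = np.expand_dims(toy_df.values, axis=-1)              # L x N x 1

features = add_temporal_features(toy_data, toy_df)
print(features.shape)  # (48, 3, 5): raw value + time of day + day of week + day of month + day of year
```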

30
scripts/data_preparation/run.sh Executable file
View File

@ -0,0 +1,30 @@
#!/bin/bash
# spatial-temporal forecasting
python scripts/data_preparation/METR-LA/generate_training_data.py
python scripts/data_preparation/PEMS-BAY/generate_training_data.py
python scripts/data_preparation/PEMS03/generate_training_data.py
python scripts/data_preparation/PEMS04/generate_training_data.py
python scripts/data_preparation/PEMS07/generate_training_data.py
python scripts/data_preparation/PEMS08/generate_training_data.py
# long-term time series forecasting
python scripts/data_preparation/ETTh1/generate_training_data.py
python scripts/data_preparation/ETTh2/generate_training_data.py
python scripts/data_preparation/ETTm1/generate_training_data.py
python scripts/data_preparation/ETTm2/generate_training_data.py
python scripts/data_preparation/Electricity/generate_training_data.py
python scripts/data_preparation/Weather/generate_training_data.py
python scripts/data_preparation/ExchangeRate/generate_training_data.py
python scripts/data_preparation/Illness/generate_training_data.py
python scripts/data_preparation/Traffic/generate_training_data.py
# large-scale mts forecasting
python scripts/data_preparation/CA/generate_training_data.py
python scripts/data_preparation/GBA/generate_training_data.py
python scripts/data_preparation/GLA/generate_training_data.py
python scripts/data_preparation/SD/generate_training_data.py
python scripts/data_preparation/BeijingAirQuality/generate_training_data.py
python scripts/data_preparation/Gaussian/generate_training_data.py
python scripts/data_preparation/Pulse/generate_training_data.py

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,251 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import math\n",
"import torch\n",
"\n",
"PROJECT_DIR = os.path.abspath(os.path.abspath('') + \"/../..\")\n",
"os.chdir(PROJECT_DIR)\n",
"\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"from basicts.data import TimeSeriesForecastingDataset\n",
"from basicts.utils import get_regular_settings\n",
"from basicts.scaler import ZScoreScaler\n",
"\n",
"\n",
"metric = \"cosine\" # metric used to calculate the similarity.\n",
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"DATA_NAME = \"METR-LA\"\n",
"DATA_NAME = \"ETTh1\"\n",
"BATCH_SIZE = 8\n",
"regular_settings = get_regular_settings(DATA_NAME)\n",
"INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence\n",
"OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence\n",
"TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios\n",
"RESCALE = regular_settings['RESCALE'] # Whether to rescale the data\n",
"NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data\n",
"NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## utilities"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# similarity computation\n",
"def cosine_similarity(x, y):\n",
" # denominator\n",
" l2_x = torch.norm(x, dim=2, p=2) + 1e-7\n",
" l2_y = torch.norm(y, dim=2, p=2) + 1e-7\n",
" l2_n = torch.matmul(l2_x.unsqueeze(dim=2), l2_y.unsqueeze(dim=2).transpose(1, 2))\n",
" # numerator\n",
" l2_d = torch.matmul(x, y.transpose(1, 2))\n",
" return l2_d / l2_n\n",
"\n",
"def get_similarity_matrix(data, metric):\n",
" if metric == \"cosine\":\n",
" sim = cosine_similarity(data, data)\n",
" elif metric == \"mse\":\n",
" sim = torch.cdist(data, data, p=2)\n",
" elif metric == \"mae\":\n",
" sim = torch.cdist(data, data, p=1)\n",
" else:\n",
" raise NotImplementedError\n",
" return sim"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"dataset_param = {\n",
" 'dataset_name': DATA_NAME,\n",
" 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO,\n",
" 'input_len': INPUT_LEN,\n",
" 'output_len': OUTPUT_LEN,\n",
"}\n",
"# get dataloader\n",
"dataset = TimeSeriesForecastingDataset(**dataset_param, mode='train')\n",
"# the whole training data\n",
"dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=3)\n",
"\n",
"scaler_param = {\n",
" 'dataset_name': DATA_NAME,\n",
" 'train_ratio': TRAIN_VAL_TEST_RATIO[0],\n",
" 'norm_each_channel': NORM_EACH_CHANNEL,\n",
" 'rescale': RESCALE,\n",
"}\n",
"scaler = ZScoreScaler(**scaler_param)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate Similarity Matrix"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 997/997 [00:02<00:00, 412.47it/s]\n"
]
}
],
"source": [
"# get similarity matrices\n",
"\n",
"# inference pipeline for a given dataloader\n",
"history_adjs_all = []\n",
"future_adjs_all = []\n",
"def inference(dataloader):\n",
" for batch in tqdm(dataloader):\n",
" future_data, history_data = batch['target'], batch['inputs']\n",
" future_data = scaler.transform(future_data)\n",
" history_data = scaler.transform(history_data)\n",
" history_data = history_data[..., 0].transpose(1, 2) # batch_size, num_nodes, history_seq_len\n",
" future_data = future_data[..., 0].transpose(1, 2) # batch_size, num_nodes, future_seq_len\n",
" history_adjs = get_similarity_matrix(history_data, metric) # batch_size, num_nodes, num_nodes\n",
" future_adjs = get_similarity_matrix(future_data, metric) # batch_size, num_nodes, num_nodes\n",
" history_adjs_all.append(history_adjs)\n",
" future_adjs_all.append(future_adjs)\n",
"# get similarity matrices\n",
"# for mode in [\"valid\"]:\n",
"for mode in [\"train\"]:\n",
" inference(dataloader)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([7969, 7, 7])\n"
]
}
],
"source": [
"# get spatial indistinguishability ratio\n",
"history_similarity = torch.cat(history_adjs_all, dim=0).detach().cpu() # num_samples, num_modes, num_nodes\n",
"future_similarity = torch.cat(future_adjs_all, dim=0).detach().cpu() # num_samples, num_modes, num_nodes\n",
"L, N, N = future_similarity.shape\n",
"print(future_similarity.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Spatial Indistinguishability Ratio"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"e_u = 0.9\n",
"e_l = 0.4\n",
"\n",
"history_similarity_filtered = torch.where(history_similarity > e_u, torch.ones_like(history_similarity), torch.zeros_like(history_similarity))\n",
"future_similarity_filtered = torch.where(future_similarity < e_l, torch.ones_like(future_similarity), torch.zeros_like(future_similarity))\n",
"overlap = history_similarity_filtered * future_similarity_filtered\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(3.8568)\n"
]
}
],
"source": [
"# overlap ratio\n",
"overlap_ratio = overlap.sum() / (L * N * N)\n",
"print(overlap_ratio * 1000)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(15.7748)\n"
]
}
],
"source": [
"# indistinguishability ratio\n",
"indistinguishability_ratio = overlap.sum() / history_similarity_filtered.sum()\n",
"print(indistinguishability_ratio * 1000)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "BasicTS",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
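Outside the notebook, the thresholding above boils down to a few tensor operations. A self-contained sketch with random stand-in similarity tensors (shapes and thresholds follow the notebook; the resulting numbers are meaningless):

```python
import torch

L, N = 100, 7        # hypothetical number of samples and nodes
e_u, e_l = 0.9, 0.4  # upper/lower similarity thresholds from the notebook

history_similarity = torch.rand(L, N, N)  # stand-in for input-window cosine similarities
future_similarity = torch.rand(L, N, N)   # stand-in for target-window cosine similarities

history_mask = (history_similarity > e_u).float()  # node pairs that look alike in the input window
future_mask = (future_similarity < e_l).float()    # node pairs that diverge in the target window
overlap = history_mask * future_mask

overlap_ratio = overlap.sum() / (L * N * N)
indistinguishability_ratio = overlap.sum() / history_mask.sum().clamp(min=1)
print(overlap_ratio.item(), indistinguishability_ratio.item())
```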

File diff suppressed because one or more lines are too long

487
scripts/dataset_analysis.py Normal file
View File

@ -0,0 +1,487 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Dataset analysis script.
Reads the datasets of the BasicTS project and generates a detailed report,
covering node/edge counts, temporal frequency, missing-value rate, spatial coverage density, and more.
"""
import os
import json
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
# Font settings (SimHei listed first so any CJK glyphs still render)
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
class DatasetAnalyzer:
"""数据集分析器"""
def __init__(self, datasets_dir: str = "datasets"):
"""
Initialize the dataset analyzer.
Args:
datasets_dir: path to the datasets directory
"""
self.datasets_dir = Path(datasets_dir)
self.datasets_info = {}
self.analysis_results = {}
def get_available_datasets(self) -> List[str]:
"""获取可用的数据集列表"""
datasets = []
for item in self.datasets_dir.iterdir():
if item.is_dir() and (item / "desc.json").exists():
datasets.append(item.name)
return sorted(datasets)
def load_dataset_description(self, dataset_name: str) -> Dict:
"""加载数据集描述文件"""
desc_path = self.datasets_dir / dataset_name / "desc.json"
with open(desc_path, 'r', encoding='utf-8') as f:
return json.load(f)
def load_dataset_data(self, dataset_name: str) -> np.ndarray:
"""加载数据集数据"""
desc = self.load_dataset_description(dataset_name)
data_path = self.datasets_dir / dataset_name / "data.dat"
# Use a memmap to load the large data file
data = np.memmap(data_path, dtype='float32', mode='r',
shape=tuple(desc['shape']))
return data.copy() # copy into memory
def load_adjacency_matrix(self, dataset_name: str) -> Optional[np.ndarray]:
"""加载邻接矩阵(如果存在)"""
adj_path = self.datasets_dir / dataset_name / "adj_mx.pkl"
if adj_path.exists():
with open(adj_path, 'rb') as f:
adj_data = pickle.load(f)
# Handle the different adjacency-matrix formats
if isinstance(adj_data, tuple):
return adj_data[0] # the first element is usually the adjacency matrix
elif isinstance(adj_data, dict):
return adj_data.get('adj_mx', adj_data.get('adj', None))
else:
return adj_data
return None
def analyze_missing_values(self, data: np.ndarray, null_val: float = 0.0) -> Dict:
"""分析缺失值"""
# 计算缺失值
if np.isnan(null_val):
missing_mask = np.isnan(data)
else:
missing_mask = (data == null_val)
total_elements = data.size
missing_elements = np.sum(missing_mask)
missing_rate = (missing_elements / total_elements) * 100
# Missing values per time step and per node
missing_by_time = np.sum(missing_mask, axis=(1, 2)) if data.ndim == 3 else np.sum(missing_mask, axis=1)
missing_by_node = np.sum(missing_mask, axis=(0, 2)) if data.ndim == 3 else np.sum(missing_mask, axis=0)
return {
'total_missing_rate': missing_rate,
'missing_elements': missing_elements,
'total_elements': total_elements,
'missing_by_time': missing_by_time,
'missing_by_node': missing_by_node,
'max_missing_time': np.max(missing_by_time),
'max_missing_node': np.max(missing_by_node) if data.ndim == 3 else 0
}
def analyze_temporal_continuity(self, data: np.ndarray, freq_minutes: int) -> Dict:
"""分析时间连续性"""
# 计算时间跨度
total_time_steps = data.shape[0]
total_hours = (total_time_steps * freq_minutes) / 60
total_days = total_hours / 24
# Data density (proportion of non-zero data points)
non_zero_ratio = np.sum(data != 0) / data.size
return {
'total_time_steps': total_time_steps,
'frequency_minutes': freq_minutes,
'total_hours': total_hours,
'total_days': total_days,
'data_density': non_zero_ratio
}
def analyze_spatial_coverage(self, data: np.ndarray, adj_matrix: Optional[np.ndarray] = None) -> Dict:
"""分析空间覆盖"""
if data.ndim == 3:
num_nodes = data.shape[1]
num_features = data.shape[2]
else:
num_nodes = data.shape[1]
num_features = 1
# Adjacency-matrix statistics
edge_info = {}
if adj_matrix is not None:
num_edges = np.sum(adj_matrix > 0)
edge_density = num_edges / (num_nodes * num_nodes)
avg_degree = np.mean(np.sum(adj_matrix > 0, axis=1))
edge_info = {
'num_edges': int(num_edges),
'edge_density': edge_density,
'avg_degree': avg_degree,
'max_degree': int(np.max(np.sum(adj_matrix > 0, axis=1))),
'min_degree': int(np.min(np.sum(adj_matrix > 0, axis=1)))
}
return {
'num_nodes': num_nodes,
'num_features': num_features,
**edge_info
}
def analyze_dataset(self, dataset_name: str) -> Dict:
"""分析单个数据集"""
print(f"正在分析数据集: {dataset_name}")
# 加载数据
desc = self.load_dataset_description(dataset_name)
data = self.load_dataset_data(dataset_name)
adj_matrix = self.load_adjacency_matrix(dataset_name)
# Basic information
basic_info = {
'name': desc['name'],
'domain': desc['domain'],
'shape': desc['shape'],
'has_graph': desc.get('has_graph', False),
'frequency_minutes': desc.get('frequency (minutes)', None)
}
# Missing-value analysis
null_val = desc.get('regular_settings', {}).get('NULL_VAL', 0.0)
missing_analysis = self.analyze_missing_values(data, null_val)
# Temporal-continuity analysis
temporal_analysis = self.analyze_temporal_continuity(data, basic_info['frequency_minutes'])
# Spatial-coverage analysis
spatial_analysis = self.analyze_spatial_coverage(data, adj_matrix)
return {
'basic_info': basic_info,
'missing_analysis': missing_analysis,
'temporal_analysis': temporal_analysis,
'spatial_analysis': spatial_analysis,
'description': desc
}
def analyze_all_datasets(self) -> Dict:
"""分析所有数据集"""
datasets = self.get_available_datasets()
print(f"发现 {len(datasets)} 个数据集: {datasets}")
for dataset_name in datasets:
try:
self.analysis_results[dataset_name] = self.analyze_dataset(dataset_name)
except Exception as e:
print(f"分析数据集 {dataset_name} 时出错: {e}")
continue
return self.analysis_results
def generate_summary_report(self) -> str:
"""生成汇总报告"""
if not self.analysis_results:
return "没有可用的分析结果"
report = []
report.append("=" * 80)
report.append("BasicTS 数据集分析报告")
report.append("=" * 80)
report.append(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report.append(f"分析数据集数量: {len(self.analysis_results)}")
report.append("")
# 数据集概览表
report.append("数据集概览:")
report.append("-" * 80)
report.append(f"{'数据集名称':<15} {'领域':<20} {'时间步数':<10} {'节点数':<8} {'特征数':<8} {'频率(分钟)':<12} {'缺失值率(%)':<12}")
report.append("-" * 80)
for name, result in self.analysis_results.items():
basic = result['basic_info']
missing = result['missing_analysis']
spatial = result['spatial_analysis']
report.append(f"{name:<15} {basic['domain']:<20} {basic['shape'][0]:<10} "
f"{spatial['num_nodes']:<8} {spatial['num_features']:<8} "
f"{basic['frequency_minutes']:<12} {missing['total_missing_rate']:<12.3f}")
report.append("")
# Detailed analysis
for name, result in self.analysis_results.items():
report.append(f"Dataset: {name}")
report.append("-" * 40)
basic = result['basic_info']
missing = result['missing_analysis']
temporal = result['temporal_analysis']
spatial = result['spatial_analysis']
report.append(f"领域: {basic['domain']}")
report.append(f"数据形状: {basic['shape']}")
report.append(f"时间频率: {basic['frequency_minutes']} 分钟")
report.append(f"时间跨度: {temporal['total_days']:.1f} 天 ({temporal['total_hours']:.1f} 小时)")
report.append(f"节点数量: {spatial['num_nodes']}")
report.append(f"特征数量: {spatial['num_features']}")
if spatial.get('num_edges'):
report.append(f"边数量: {spatial['num_edges']}")
report.append(f"边密度: {spatial['edge_density']:.4f}")
report.append(f"平均度数: {spatial['avg_degree']:.2f}")
report.append(f"缺失值率: {missing['total_missing_rate']:.3f}%")
report.append(f"数据密度: {temporal['data_density']:.3f}")
report.append("")
return "\n".join(report)
def generate_comparative_analysis(self) -> str:
"""生成对比分析报告"""
if not self.analysis_results:
return "没有可用的分析结果"
report = []
report.append("=" * 80)
report.append("数据集对比分析")
report.append("=" * 80)
report.append("")
# Group by domain
domains = {}
for name, result in self.analysis_results.items():
domain = result['basic_info']['domain']
if domain not in domains:
domains[domain] = []
domains[domain].append((name, result))
for domain, datasets in domains.items():
report.append(f"领域: {domain}")
report.append("-" * 40)
# 该领域的数据集统计
missing_rates = [d[1]['missing_analysis']['total_missing_rate'] for d in datasets]
node_counts = [d[1]['spatial_analysis']['num_nodes'] for d in datasets]
time_steps = [d[1]['basic_info']['shape'][0] for d in datasets]
report.append(f"数据集数量: {len(datasets)}")
report.append(f"平均缺失值率: {np.mean(missing_rates):.3f}%")
report.append(f"缺失值率范围: {min(missing_rates):.3f}% - {max(missing_rates):.3f}%")
report.append(f"平均节点数: {np.mean(node_counts):.1f}")
report.append(f"节点数范围: {min(node_counts)} - {max(node_counts)}")
report.append(f"平均时间步数: {np.mean(time_steps):.0f}")
report.append("")
# Spatial coverage density analysis
report.append("Spatial coverage density analysis:")
report.append("-" * 40)
spatial_datasets = [(name, result) for name, result in self.analysis_results.items()
if result['spatial_analysis'].get('num_edges')]
if spatial_datasets:
for name, result in spatial_datasets:
spatial = result['spatial_analysis']
report.append(f"{name}: {spatial['num_nodes']} 个节点, {spatial['num_edges']} 条边, "
f"密度 {spatial['edge_density']:.4f}, 平均度数 {spatial['avg_degree']:.2f}")
else:
report.append("没有发现包含图结构的数据集")
report.append("")
# Temporal continuity analysis
report.append("Temporal continuity analysis:")
report.append("-" * 40)
temporal_data = []
for name, result in self.analysis_results.items():
temporal = result['temporal_analysis']
temporal_data.append({
'name': name,
'days': temporal['total_days'],
'density': temporal['data_density'],
'frequency': temporal['frequency_minutes']
})
# Sort by time span
temporal_data.sort(key=lambda x: x['days'], reverse=True)
for data in temporal_data:
report.append(f"{data['name']}: {data['days']:.1f} 天, "
f"数据密度 {data['density']:.3f}, "
f"频率 {data['frequency']} 分钟")
return "\n".join(report)
def save_reports(self, output_dir: str = "analysis_reports"):
"""保存分析报告"""
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True)
# Save the summary report
summary_report = self.generate_summary_report()
with open(output_path / "summary_report.txt", 'w', encoding='utf-8') as f:
f.write(summary_report)
# Save the comparative analysis report
comparative_report = self.generate_comparative_analysis()
with open(output_path / "comparative_analysis.txt", 'w', encoding='utf-8') as f:
f.write(comparative_report)
# Save the detailed JSON report
with open(output_path / "detailed_analysis.json", 'w', encoding='utf-8') as f:
json.dump(self.analysis_results, f, indent=2, ensure_ascii=False, default=str)
print(f"报告已保存到目录: {output_path}")
def create_visualizations(self, output_dir: str = "analysis_reports"):
"""创建可视化图表"""
if not self.analysis_results:
print("没有可用的分析结果")
return
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True)
# Chart style
plt.style.use('seaborn-v0_8')
# 1. Missing-value rate comparison
fig, ax = plt.subplots(figsize=(12, 6))
names = list(self.analysis_results.keys())
missing_rates = [self.analysis_results[name]['missing_analysis']['total_missing_rate']
for name in names]
bars = ax.bar(names, missing_rates, color='skyblue', alpha=0.7)
ax.set_title('Missing-value rate by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Missing-value rate (%)')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, rate in zip(bars, missing_rates):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
f'{rate:.2f}%', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "missing_rates_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 2. Node count comparison
fig, ax = plt.subplots(figsize=(12, 6))
node_counts = [self.analysis_results[name]['spatial_analysis']['num_nodes']
for name in names]
bars = ax.bar(names, node_counts, color='lightgreen', alpha=0.7)
ax.set_title('Node count by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Number of nodes')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, count in zip(bars, node_counts):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(node_counts)*0.01,
f'{count}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "node_counts_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 3. Time span comparison
fig, ax = plt.subplots(figsize=(12, 6))
time_days = [self.analysis_results[name]['temporal_analysis']['total_days']
for name in names]
bars = ax.bar(names, time_days, color='orange', alpha=0.7)
ax.set_title('Time span by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Time span (days)')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, days in zip(bars, time_days):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(time_days)*0.01,
f'{days:.1f}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "time_span_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 4. Scatter plot: node count vs. missing-value rate
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(node_counts, missing_rates, s=100, alpha=0.7, c='red')
# Add dataset labels
for i, name in enumerate(names):
ax.annotate(name, (node_counts[i], missing_rates[i]),
xytext=(5, 5), textcoords='offset points', fontsize=8)
ax.set_xlabel('Number of nodes')
ax.set_ylabel('Missing-value rate (%)')
ax.set_title('Node count vs. missing-value rate', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(output_path / "nodes_vs_missing_rates.png", dpi=300, bbox_inches='tight')
plt.close()
print(f"可视化图表已保存到目录: {output_path}")
def main():
"""主函数"""
print("BasicTS 数据集分析工具")
print("=" * 50)
# Create the analyzer
analyzer = DatasetAnalyzer()
# Analyze all datasets
analyzer.analyze_all_datasets()
# Generate and print the reports
print("\n" + "=" * 80)
print("Dataset Analysis Report")
print("=" * 80)
summary_report = analyzer.generate_summary_report()
print(summary_report)
print("\n" + "=" * 80)
print("对比分析报告")
print("=" * 80)
comparative_report = analyzer.generate_comparative_analysis()
print(comparative_report)
# Save reports and visualizations
analyzer.save_reports()
analyzer.create_visualizations()
print("\n分析完成!")
if __name__ == "__main__":
main()
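A minimal interactive usage sketch for the analyzer above (it assumes the repository root is on the Python path and that BasicTS-formatted folders, each with desc.json and data.dat, sit under ./datasets):

```python
from scripts.dataset_analysis import DatasetAnalyzer

analyzer = DatasetAnalyzer(datasets_dir="datasets")
analyzer.analyze_all_datasets()                                # prints progress per dataset
analyzer.save_reports(output_dir="analysis_reports")           # summary_report.txt, comparative_analysis.txt, detailed_analysis.json
analyzer.create_visualizations(output_dir="analysis_reports")  # PNG charts
```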

194
scripts/dataset_analyzer.py Normal file
View File

@ -0,0 +1,194 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Dataset analyzer.
Reads the datasets of the BasicTS project and generates a detailed report.
"""
import os
import json
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
class DatasetAnalyzer:
"""数据集分析器"""
def __init__(self, datasets_dir: str = "datasets"):
"""
Initialize the dataset analyzer.
Args:
datasets_dir: path to the datasets directory
"""
self.datasets_dir = Path(datasets_dir)
self.datasets_info = {}
self.analysis_results = {}
def get_available_datasets(self) -> List[str]:
"""获取可用的数据集列表"""
datasets = []
for item in self.datasets_dir.iterdir():
if item.is_dir() and (item / "desc.json").exists():
datasets.append(item.name)
return sorted(datasets)
def load_dataset_description(self, dataset_name: str) -> Dict:
"""加载数据集描述文件"""
desc_path = self.datasets_dir / dataset_name / "desc.json"
with open(desc_path, 'r', encoding='utf-8') as f:
return json.load(f)
def load_dataset_data(self, dataset_name: str) -> np.ndarray:
"""加载数据集数据"""
desc = self.load_dataset_description(dataset_name)
data_path = self.datasets_dir / dataset_name / "data.dat"
# Use a memmap to load the large data file
data = np.memmap(data_path, dtype='float32', mode='r',
shape=tuple(desc['shape']))
return data.copy() # copy into memory
def load_adjacency_matrix(self, dataset_name: str) -> Optional[np.ndarray]:
"""加载邻接矩阵(如果存在)"""
adj_path = self.datasets_dir / dataset_name / "adj_mx.pkl"
if adj_path.exists():
with open(adj_path, 'rb') as f:
adj_data = pickle.load(f)
# Handle the different adjacency-matrix formats
if isinstance(adj_data, tuple):
return adj_data[0] # the first element is usually the adjacency matrix
elif isinstance(adj_data, dict):
return adj_data.get('adj_mx', adj_data.get('adj', None))
else:
return adj_data
return None
def analyze_missing_values(self, data: np.ndarray, null_val: float = 0.0) -> Dict:
"""分析缺失值"""
# 计算缺失值
if np.isnan(null_val):
missing_mask = np.isnan(data)
else:
missing_mask = (data == null_val)
total_elements = data.size
missing_elements = np.sum(missing_mask)
missing_rate = (missing_elements / total_elements) * 100
# Missing values per time step and per node
missing_by_time = np.sum(missing_mask, axis=(1, 2)) if data.ndim == 3 else np.sum(missing_mask, axis=1)
missing_by_node = np.sum(missing_mask, axis=(0, 2)) if data.ndim == 3 else np.sum(missing_mask, axis=0)
return {
'total_missing_rate': missing_rate,
'missing_elements': missing_elements,
'total_elements': total_elements,
'missing_by_time': missing_by_time,
'missing_by_node': missing_by_node,
'max_missing_time': np.max(missing_by_time),
'max_missing_node': np.max(missing_by_node) if data.ndim == 3 else 0
}
def analyze_temporal_continuity(self, data: np.ndarray, freq_minutes: int) -> Dict:
"""分析时间连续性"""
# 计算时间跨度
total_time_steps = data.shape[0]
total_hours = (total_time_steps * freq_minutes) / 60
total_days = total_hours / 24
# Data density (proportion of non-zero data points)
non_zero_ratio = np.sum(data != 0) / data.size
return {
'total_time_steps': total_time_steps,
'frequency_minutes': freq_minutes,
'total_hours': total_hours,
'total_days': total_days,
'data_density': non_zero_ratio
}
def analyze_spatial_coverage(self, data: np.ndarray, adj_matrix: Optional[np.ndarray] = None) -> Dict:
"""分析空间覆盖"""
if data.ndim == 3:
num_nodes = data.shape[1]
num_features = data.shape[2]
else:
num_nodes = data.shape[1]
num_features = 1
# Adjacency-matrix statistics
edge_info = {}
if adj_matrix is not None:
num_edges = np.sum(adj_matrix > 0)
edge_density = num_edges / (num_nodes * num_nodes)
avg_degree = np.mean(np.sum(adj_matrix > 0, axis=1))
edge_info = {
'num_edges': int(num_edges),
'edge_density': edge_density,
'avg_degree': avg_degree,
'max_degree': int(np.max(np.sum(adj_matrix > 0, axis=1))),
'min_degree': int(np.min(np.sum(adj_matrix > 0, axis=1)))
}
return {
'num_nodes': num_nodes,
'num_features': num_features,
**edge_info
}
def analyze_dataset(self, dataset_name: str) -> Dict:
"""分析单个数据集"""
print(f"正在分析数据集: {dataset_name}")
# 加载数据
desc = self.load_dataset_description(dataset_name)
data = self.load_dataset_data(dataset_name)
adj_matrix = self.load_adjacency_matrix(dataset_name)
# Basic information
basic_info = {
'name': desc['name'],
'domain': desc['domain'],
'shape': desc['shape'],
'has_graph': desc.get('has_graph', False),
'frequency_minutes': desc.get('frequency (minutes)', None)
}
# Missing-value analysis
null_val = desc.get('regular_settings', {}).get('NULL_VAL', 0.0)
missing_analysis = self.analyze_missing_values(data, null_val)
# Temporal-continuity analysis
temporal_analysis = self.analyze_temporal_continuity(data, basic_info['frequency_minutes'])
# Spatial-coverage analysis
spatial_analysis = self.analyze_spatial_coverage(data, adj_matrix)
return {
'basic_info': basic_info,
'missing_analysis': missing_analysis,
'temporal_analysis': temporal_analysis,
'spatial_analysis': spatial_analysis,
'description': desc
}
def analyze_all_datasets(self) -> Dict:
"""分析所有数据集"""
datasets = self.get_available_datasets()
print(f"发现 {len(datasets)} 个数据集: {datasets}")
for dataset_name in datasets:
try:
self.analysis_results[dataset_name] = self.analyze_dataset(dataset_name)
except Exception as e:
print(f"分析数据集 {dataset_name} 时出错: {e}")
continue
return self.analysis_results
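For spot-checking a single dataset instead of the whole directory, analyze_dataset can also be called directly (a minimal sketch; PEMS-BAY is only an example and must exist under ./datasets in BasicTS format):

```python
from scripts.dataset_analyzer import DatasetAnalyzer

analyzer = DatasetAnalyzer("datasets")
result = analyzer.analyze_dataset("PEMS-BAY")
print(result["spatial_analysis"]["num_nodes"],
      f"{result['missing_analysis']['total_missing_rate']:.3f}%")
```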

View File

@ -0,0 +1,34 @@
import os
import sys
import argparse
def main():
parser = argparse.ArgumentParser(description="Download GPT-2 via kagglehub to target directory")
parser.add_argument("--target", type=str, default="/home/azureuser/code/REPST/GPT-2", help="Target directory to store GPT-2")
args = parser.parse_args()
try:
import kagglehub
except Exception as e:
print("[ERROR] kagglehub 未安装或导入失败。请先运行: pip install kagglehub")
print(" 需在 ~/.kaggle/kaggle.json 配置 Kaggle API。")
sys.exit(1)
os.makedirs(args.target, exist_ok=True)
handle = "openai/gpt-2"
print(f"开始通过 kagglehub 下载 {handle}{args.target} ...")
try:
path = kagglehub.model_download(handle, path=args.target)
except Exception as e:
print(f"[ERROR] 下载失败: {e}")
sys.exit(2)
print(f"下载完成,已保存到: {path}")
if __name__ == "__main__":
main()

279
scripts/report_generator.py Normal file
View File

@ -0,0 +1,279 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Report generator.
Generates detailed reports from the dataset analysis results.
"""
import json
import numpy as np
from pathlib import Path
from datetime import datetime
from typing import Dict, List
import matplotlib.pyplot as plt
import seaborn as sns
# Font settings (SimHei listed first so any CJK glyphs still render)
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
class ReportGenerator:
"""报告生成器"""
def __init__(self, analysis_results: Dict):
"""
Initialize the report generator.
Args:
analysis_results: dataset analysis results
"""
self.analysis_results = analysis_results
def generate_summary_report(self) -> str:
"""生成汇总报告"""
if not self.analysis_results:
return "没有可用的分析结果"
report = []
report.append("=" * 80)
report.append("BasicTS 数据集分析报告")
report.append("=" * 80)
report.append(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report.append(f"分析数据集数量: {len(self.analysis_results)}")
report.append("")
# 数据集概览表
report.append("数据集概览:")
report.append("-" * 80)
report.append(f"{'数据集名称':<15} {'领域':<20} {'时间步数':<10} {'节点数':<8} {'特征数':<8} {'频率(分钟)':<12} {'缺失值率(%)':<12}")
report.append("-" * 80)
for name, result in self.analysis_results.items():
basic = result['basic_info']
missing = result['missing_analysis']
spatial = result['spatial_analysis']
report.append(f"{name:<15} {basic['domain']:<20} {basic['shape'][0]:<10} "
f"{spatial['num_nodes']:<8} {spatial['num_features']:<8} "
f"{basic['frequency_minutes']:<12} {missing['total_missing_rate']:<12.3f}")
report.append("")
# Detailed analysis
for name, result in self.analysis_results.items():
report.append(f"Dataset: {name}")
report.append("-" * 40)
basic = result['basic_info']
missing = result['missing_analysis']
temporal = result['temporal_analysis']
spatial = result['spatial_analysis']
report.append(f"领域: {basic['domain']}")
report.append(f"数据形状: {basic['shape']}")
report.append(f"时间频率: {basic['frequency_minutes']} 分钟")
report.append(f"时间跨度: {temporal['total_days']:.1f} 天 ({temporal['total_hours']:.1f} 小时)")
report.append(f"节点数量: {spatial['num_nodes']}")
report.append(f"特征数量: {spatial['num_features']}")
if spatial.get('num_edges'):
report.append(f"边数量: {spatial['num_edges']}")
report.append(f"边密度: {spatial['edge_density']:.4f}")
report.append(f"平均度数: {spatial['avg_degree']:.2f}")
report.append(f"缺失值率: {missing['total_missing_rate']:.3f}%")
report.append(f"数据密度: {temporal['data_density']:.3f}")
report.append("")
return "\n".join(report)
def generate_comparative_analysis(self) -> str:
"""生成对比分析报告"""
if not self.analysis_results:
return "没有可用的分析结果"
report = []
report.append("=" * 80)
report.append("数据集对比分析")
report.append("=" * 80)
report.append("")
# Group by domain
domains = {}
for name, result in self.analysis_results.items():
domain = result['basic_info']['domain']
if domain not in domains:
domains[domain] = []
domains[domain].append((name, result))
for domain, datasets in domains.items():
report.append(f"领域: {domain}")
report.append("-" * 40)
# 该领域的数据集统计
missing_rates = [d[1]['missing_analysis']['total_missing_rate'] for d in datasets]
node_counts = [d[1]['spatial_analysis']['num_nodes'] for d in datasets]
time_steps = [d[1]['basic_info']['shape'][0] for d in datasets]
report.append(f"数据集数量: {len(datasets)}")
report.append(f"平均缺失值率: {np.mean(missing_rates):.3f}%")
report.append(f"缺失值率范围: {min(missing_rates):.3f}% - {max(missing_rates):.3f}%")
report.append(f"平均节点数: {np.mean(node_counts):.1f}")
report.append(f"节点数范围: {min(node_counts)} - {max(node_counts)}")
report.append(f"平均时间步数: {np.mean(time_steps):.0f}")
report.append("")
# Spatial coverage density analysis
report.append("Spatial coverage density analysis:")
report.append("-" * 40)
spatial_datasets = [(name, result) for name, result in self.analysis_results.items()
if result['spatial_analysis'].get('num_edges')]
if spatial_datasets:
for name, result in spatial_datasets:
spatial = result['spatial_analysis']
report.append(f"{name}: {spatial['num_nodes']} 个节点, {spatial['num_edges']} 条边, "
f"密度 {spatial['edge_density']:.4f}, 平均度数 {spatial['avg_degree']:.2f}")
else:
report.append("没有发现包含图结构的数据集")
report.append("")
# Temporal continuity analysis
report.append("Temporal continuity analysis:")
report.append("-" * 40)
temporal_data = []
for name, result in self.analysis_results.items():
temporal = result['temporal_analysis']
temporal_data.append({
'name': name,
'days': temporal['total_days'],
'density': temporal['data_density'],
'frequency': temporal['frequency_minutes']
})
# Sort by time span
temporal_data.sort(key=lambda x: x['days'], reverse=True)
for data in temporal_data:
report.append(f"{data['name']}: {data['days']:.1f} 天, "
f"数据密度 {data['density']:.3f}, "
f"频率 {data['frequency']} 分钟")
return "\n".join(report)
def create_visualizations(self, output_dir: str = "analysis_reports"):
"""创建可视化图表"""
if not self.analysis_results:
print("没有可用的分析结果")
return
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True)
# Chart style
plt.style.use('seaborn-v0_8')
# 1. Missing-value rate comparison
fig, ax = plt.subplots(figsize=(12, 6))
names = list(self.analysis_results.keys())
missing_rates = [self.analysis_results[name]['missing_analysis']['total_missing_rate']
for name in names]
bars = ax.bar(names, missing_rates, color='skyblue', alpha=0.7)
ax.set_title('Missing-value rate by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Missing-value rate (%)')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, rate in zip(bars, missing_rates):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
f'{rate:.2f}%', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "missing_rates_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 2. Node count comparison
fig, ax = plt.subplots(figsize=(12, 6))
node_counts = [self.analysis_results[name]['spatial_analysis']['num_nodes']
for name in names]
bars = ax.bar(names, node_counts, color='lightgreen', alpha=0.7)
ax.set_title('Node count by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Number of nodes')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, count in zip(bars, node_counts):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(node_counts)*0.01,
f'{count}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "node_counts_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 3. Time span comparison
fig, ax = plt.subplots(figsize=(12, 6))
time_days = [self.analysis_results[name]['temporal_analysis']['total_days']
for name in names]
bars = ax.bar(names, time_days, color='orange', alpha=0.7)
ax.set_title('Time span by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Time span (days)')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, days in zip(bars, time_days):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(time_days)*0.01,
f'{days:.1f}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "time_span_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 4. Scatter plot: node count vs. missing-value rate
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(node_counts, missing_rates, s=100, alpha=0.7, c='red')
# Add dataset labels
for i, name in enumerate(names):
ax.annotate(name, (node_counts[i], missing_rates[i]),
xytext=(5, 5), textcoords='offset points', fontsize=8)
ax.set_xlabel('Number of nodes')
ax.set_ylabel('Missing-value rate (%)')
ax.set_title('Node count vs. missing-value rate', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(output_path / "nodes_vs_missing_rates.png", dpi=300, bbox_inches='tight')
plt.close()
print(f"可视化图表已保存到目录: {output_path}")
def save_reports(self, output_dir: str = "analysis_reports"):
"""保存分析报告"""
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True)
# Save the summary report
summary_report = self.generate_summary_report()
with open(output_path / "summary_report.txt", 'w', encoding='utf-8') as f:
f.write(summary_report)
# Save the comparative analysis report
comparative_report = self.generate_comparative_analysis()
with open(output_path / "comparative_analysis.txt", 'w', encoding='utf-8') as f:
f.write(comparative_report)
# Save the detailed JSON report
with open(output_path / "detailed_analysis.json", 'w', encoding='utf-8') as f:
json.dump(self.analysis_results, f, indent=2, ensure_ascii=False, default=str)
print(f"报告已保存到目录: {output_path}")

View File

@ -0,0 +1,62 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Main dataset-analysis script.
Runs the full dataset analysis pipeline.
"""
import sys
import os
from pathlib import Path
# Add the project root directory to the Python path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
from scripts.dataset_analyzer import DatasetAnalyzer
from scripts.report_generator import ReportGenerator
def main():
"""主函数"""
print("BasicTS 数据集分析工具")
print("=" * 50)
# Create the analyzer
analyzer = DatasetAnalyzer()
# Analyze all datasets
print("Starting dataset analysis...")
analysis_results = analyzer.analyze_all_datasets()
if not analysis_results:
print("没有找到可分析的数据集")
return
# Create the report generator
report_generator = ReportGenerator(analysis_results)
# Generate and print the reports
print("\n" + "=" * 80)
print("Dataset Analysis Report")
print("=" * 80)
summary_report = report_generator.generate_summary_report()
print(summary_report)
print("\n" + "=" * 80)
print("对比分析报告")
print("=" * 80)
comparative_report = report_generator.generate_comparative_analysis()
print(comparative_report)
# Save reports and visualizations
print("\nSaving reports and visualization charts...")
report_generator.save_reports()
report_generator.create_visualizations()
print("\n分析完成!")
print("报告文件保存在 'analysis_reports' 目录中")
if __name__ == "__main__":
main()