chore: initialize repo, add .gitignore, data scripts

This commit is contained in:
Auto Commit 2025-09-17 14:00:51 +00:00
parent 4c279cc747
commit 8372a7580c
48 changed files with 5789 additions and 75 deletions

66
.gitignore vendored Normal file
View File

@ -0,0 +1,66 @@
# Project artifacts
GPT-2/
datasets/
checkpoints/
log/
# Python
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
.venv/
venv/
env/
.env
# Data & weights
*.npz
*.npy
*.pkl
*.pt
*.pth
*.ckpt
*.bin
# Logs
*.log
# VSCode
.vscode/
# JetBrains / IDEs
.idea/
.fleet/
.vs/
*.code-workspace
# OS files
.DS_Store
Thumbs.db
# Python build & test
build/
dist/
*.egg-info/
pip-wheel-metadata/
.pytest_cache/
.mypy_cache/
.ruff_cache/
.tox/
.nox/
.coverage
coverage.xml
htmlcov/
.cache/
# Editors swap/history
*.swp
*.swo
.history/
# Jupyter
.ipynb_checkpoints/

View File

@ -1,77 +1,31 @@
<div align="center">
<!-- <h1><b> Time-LLM </b></h1> -->
<!-- <h2><b> Time-LLM </b></h2> -->
<h2><b> (IJCAI'25) RePST: Language Model Empowered Spatio-Temporal Forecasting via Semantic-Oriented Reprogramming </b></h2>
</div>
RePST (fixed version)
Prepare the GPT-2 pretrained weights:
```bash
mkdir GPT-2
wget "https://huggingface.co/openai-community/gpt2/resolve/main/config.json?download=true" -O ./GPT-2/config.json
wget "https://huggingface.co/openai-community/gpt2/resolve/main/pytorch_model.bin?download=true" -O ./GPT-2/pytorch_model.bin
```
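A quick, optional sanity check that the local weights load (a minimal sketch; assumes `transformers` and `torch` are installed, and the parameter count is approximate):
```python
from transformers import GPT2Model

# Load from the local ./GPT-2 folder (config.json + pytorch_model.bin).
gpt2 = GPT2Model.from_pretrained('./GPT-2')
print(sum(p.numel() for p in gpt2.parameters()))  # roughly 124M parameters for GPT-2 small
```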
Prepare the PEMS-BAY dataset following the [BasicTS](https://github.com/GestaltCogTeam/BasicTS/blob/master/tutorial/dataset_design.md) dataset design guide.
The prepared data is available on [Google Drive](https://drive.google.com/drive/folders/14EJVODCU48fGK0FkyeVom_9lETh80Yjp?usp=sharing) and can be downloaded with gdown.
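For example, one way to fetch the folder with gdown's Python API (a minimal sketch; assumes `pip install gdown` and that the shared folder stays public; if the folder ships an archive, extract it afterwards as described below):
```python
import gdown

# Download the shared PEMS-BAY folder into ./datasets/PEMS-BAY
url = "https://drive.google.com/drive/folders/14EJVODCU48fGK0FkyeVom_9lETh80Yjp"
gdown.download_folder(url, output="datasets/PEMS-BAY", quiet=False)
```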
After extracting, make sure the `./datasets/PEMS-BAY` folder contains `adj_mx.pkl`, `data.dat`, and `desc.json`, then run the script:
```bash
python prepare_pems_bay.py
```
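A hedged sanity check of the generated splits (file names come from the script above; shapes depend on your `--seq_len`/`--pred_len`):
```python
import numpy as np

for split in ("train", "val", "test"):
    d = np.load(f"datasets/PEMS-BAY/{split}.npz")
    # x/y are stored as (samples, nodes, length, channels), matching the model's b n l m layout
    print(split, d["x"].shape, d["y"].shape)
```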
---
>
> 🙋 Please let us know if you find a mistake or have any suggestions!
>
> 🐝 The full version of this paper can be accessed at https://arxiv.org/abs/2408.14505.
>
> 🌟 If you find this resource helpful, please consider starring this repository and citing our research:
This generates `train.npz`, `val.npz`, and `test.npz` under the PEMS-BAY dataset folder.
```
@inproceedings{wang2025repst,
title={RePST: Language Model Empowered Spatio-Temporal Forecasting via Semantic-Oriented Reprogramming},
author={Wang, Hao and Han, Jindong and Fan, Wei and Sun, Leilei and Liu, Hao},
booktitle={Proceedings of the 34th International Joint Conference on Artificial Intelligence},
year={2025}
}
```
Set up the BasicTS environment as described in the BasicTS repository, or use
`pip install -r requirement.txt`
## Introduction
This repository contains the implementation of REPST, a framework for spatio-temporal forecasting that leverages the reasoning and generalization capabilities of Pre-trained Language Models (PLMs). REPST utilizes a semantic-aware spatio-temporal decomposer and selective discrete reprogramming to enable PLMs to handle complex spatio-temporal data, especially in data-scarce environments.
I used an existing BasicTS environment directly, so this setup has not been tested independently.
<p align="center">
<img src="./figures/repst.png" height = "360" alt="" align=center />
</p>
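For orientation only, a minimal sketch of cross-attention reprogramming in the Time-LLM style, where patch embeddings attend over a small set of text prototypes distilled from the PLM's word-embedding table. RePST's selective discrete reprogramming differs in how the vocabulary is built and selected, and the dimensions (`d_model`, `d_llm`, `vocab_size`, `n_prototypes`) below are illustrative assumptions, not the repository's values:
```python
import torch
import torch.nn as nn

class ReprogrammingSketch(nn.Module):
    """Illustrative only: not the repository's reprogramming module."""
    def __init__(self, d_model=64, d_llm=768, vocab_size=50257, n_prototypes=1000):
        super().__init__()
        self.prototype_map = nn.Linear(vocab_size, n_prototypes)  # distil the word-embedding table
        self.cross_attn = nn.MultiheadAttention(d_model, num_heads=4,
                                                kdim=d_llm, vdim=d_llm, batch_first=True)

    def forward(self, patch_emb, word_emb):
        # patch_emb: (B, P, d_model); word_emb: (vocab_size, d_llm), e.g. gpt2.wte.weight
        prototypes = self.prototype_map(word_emb.T).T                # (n_prototypes, d_llm)
        prototypes = prototypes.unsqueeze(0).expand(patch_emb.size(0), -1, -1)
        out, _ = self.cross_attn(patch_emb, prototypes, prototypes)  # (B, P, d_model)
        return out
```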
- RePST comprises two key components: (1) a dynamic mode decomposition approach that disentangles spatially correlated time series into interpretable components, and (2) an expanded spatio-temporal vocabulary that helps PLMs better understand the dynamics of complex spatio-temporal systems, to guide PLM reasoning.
<p align="center">
<img src="./figures/method-detailed-illustration.png" height = "190" alt="" align=center />
</p>
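For intuition about component (1), here is a minimal sketch of exact dynamic mode decomposition applied to a `(T, N)` series. It is illustrative only and is not the repository's semantic-aware spatio-temporal decomposer:
```python
import numpy as np

def dmd_modes(series: np.ndarray, rank: int = 8):
    """Exact-DMD sketch: series has shape (T, N); returns eigenvalues and spatial modes."""
    X, Y = series[:-1].T, series[1:].T                         # snapshot pairs, shape (N, T-1)
    U, s, Vh = np.linalg.svd(X, full_matrices=False)
    r = min(rank, int(np.sum(s > 1e-10)))                      # truncate to a stable rank
    U, s, Vh = U[:, :r], s[:r], Vh[:r]
    A_tilde = U.conj().T @ Y @ Vh.conj().T @ np.diag(1.0 / s)  # reduced linear operator
    eigvals, W = np.linalg.eig(A_tilde)                        # temporal dynamics per mode
    modes = Y @ Vh.conj().T @ np.diag(1.0 / s) @ W             # exact DMD spatial modes
    return eigvals, modes

# e.g. eigvals, modes = dmd_modes(np.random.randn(200, 325))  # 325 sensors, 200 steps (toy data)
```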
## Requirements
Use Python 3.11 from Miniconda.
- torch==2.0.1
- accelerate==0.28.0
- einops==0.6.0
- matplotlib==3.7.0
- numpy==1.24.4
- pandas==2.1.4
- scikit_learn==1.3.2
- scipy==1.11.4
- tqdm==4.66.1
- transformers==4.36.2
To install all dependencies:
```
pip install -r requirements.txt
```
## Datasets
# Pending
You can access the pre-processed datasets from [[Google Drive]](https://drive.google.com/), then place the downloaded contents under `./dataset`.
## Detailed usage
Please refer to ```run.py``` for the detailed description of each hyperparameter.
## Acknowledgement
Our baseline model implementation adopts [BasicTS](https://github.com/GestaltCogTeam/BasicTS) as the code base, and we have extensively modified it for our purposes. We thank the authors for sharing their implementations and related resources.
Run training:
```bash
python run.py --root_path datasets --data_path PEMS-BAY --device cuda:0 --seq_len 12 --pred_len 12
```

View File

@ -2,6 +2,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from math import sqrt
class ReplicationPad1d(nn.Module):
@ -20,7 +21,7 @@ class TokenEmbedding(nn.Module):
padding = 1
self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model,
kernel_size=3, padding=padding, padding_mode='circular', bias=False)
self.confusion_layer = nn.Linear(12, 1)
self.confusion_layer = nn.LazyLinear(1)
# if air_quality
# self.confusion_layer = nn.Linear(42, 1)
@ -31,8 +32,8 @@ class TokenEmbedding(nn.Module):
m.weight, mode='fan_in', nonlinearity='leaky_relu')
def forward(self, x):
b, n, m, pn, pl = x.shape
x = self.tokenConv(x.reshape(b*n, pl, m*pn))
b, n, m, pn, pl = x.shape # batch, node, feature, patch_num, patch_len
x = self.tokenConv(x.reshape(b*n, pl, m*pn)) # batch*node, patch_len, feature*patch_num
x = self.confusion_layer(x)
return x.reshape(b, n, -1)
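Note on the `confusion_layer` change above: `nn.LazyLinear(1)` infers `in_features` on the first forward pass, so the layer no longer has to be hard-coded to 12 (PEMS-BAY) or 42 (air quality); once materialized, the size is fixed for that model instance. A standalone illustration in plain PyTorch (not the repository's module):
```python
import torch
import torch.nn as nn

layer = nn.LazyLinear(out_features=1)   # in_features unknown until the first call
out = layer(torch.randn(8, 12))         # first forward materializes in_features = 12
print(layer.weight.shape)               # torch.Size([1, 12])
```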

View File

@ -7,8 +7,8 @@ from transformers.models.gpt2.modeling_gpt2 import GPT2Model
from transformers import GPT2Model, GPT2Config
from einops import rearrange
from reprogramming import *
from normalizer import *
from .reprogramming import *
from .normalizer import *
class repst(nn.Module):
@ -34,9 +34,6 @@ class repst(nn.Module):
self.patch_embedding = PatchEmbedding(self.d_model, self.patch_len, self.stride, self.dropout)
self.gpts = GPT2Model.from_pretrained('./GPT-2', output_attentions=True, output_hidden_states=True)
self.gpts.h = self.gpts.h[:self.gpt_layers]

115
prepare_pems_bay.py Normal file
View File

@ -0,0 +1,115 @@
import os
import json
import argparse
import numpy as np
def generate_offsets(seq_length_x: int, seq_length_y: int):
x_offsets = np.sort(np.concatenate((np.arange(-(seq_length_x - 1), 1, 1),)))
y_offsets = np.sort(np.arange(1, seq_length_y + 1, 1))
return x_offsets, y_offsets
def make_sliding_windows(data: np.ndarray, x_offsets: np.ndarray, y_offsets: np.ndarray):
# data: (T, N, C)
num_samples = data.shape[0]
min_t = abs(int(np.min(x_offsets)))
max_t = num_samples - int(np.max(y_offsets))
x, y = [], []
for t in range(min_t, max_t):
x.append(data[t + x_offsets, ...]) # (seq_len, N, C)
y.append(data[t + y_offsets, ...]) # (pred_len, N, C)
x = np.stack(x, axis=0).astype(np.float32) # (S, seq_len, N, C)
y = np.stack(y, axis=0).astype(np.float32) # (S, pred_len, N, C)
# Reorder to (S, N, L, C) to match model expectation: b n l m
x = np.transpose(x, (0, 2, 1, 3))
y = np.transpose(y, (0, 2, 1, 3))
return x, y
def split_by_ratio(x: np.ndarray, y: np.ndarray, ratios):
r_train, r_val, r_test = ratios
num_samples = x.shape[0]
n_train = int(round(num_samples * r_train))
n_val = int(round(num_samples * r_val))
n_test = num_samples - n_train - n_val
x_train, y_train = x[:n_train], y[:n_train]
x_val, y_val = x[n_train:n_train + n_val], y[n_train:n_train + n_val]
x_test, y_test = x[-n_test:], y[-n_test:]
return (x_train, y_train), (x_val, y_val), (x_test, y_test)
def main():
parser = argparse.ArgumentParser(description="Prepare PEMS-BAY to train/val/test .npz")
parser.add_argument("--dataset_dir", type=str, default='./datasets/PEMS-BAY', help="Path to datasets/PEMS-BAY directory")
parser.add_argument("--seq_len", type=int, default=12)
parser.add_argument("--pred_len", type=int, default=12)
parser.add_argument("--speed_channel_only", action="store_true", help="Use only the first channel (speed)")
args = parser.parse_args()
dataset_dir = args.dataset_dir
desc_path = os.path.join(dataset_dir, "desc.json")
data_path = os.path.join(dataset_dir, "data.dat")
if not os.path.exists(desc_path):
raise FileNotFoundError(f"desc.json not found at {desc_path}")
if not os.path.exists(data_path):
raise FileNotFoundError(f"data.dat not found at {data_path}")
with open(desc_path, "r") as f:
desc = json.load(f)
shape = desc.get("shape") # expected [T, N, C]
if not shape or len(shape) not in (2, 3):
raise ValueError(f"Invalid shape in desc.json: {shape}")
total_elems = int(np.prod(shape))
raw = np.fromfile(data_path, dtype=np.float32)
if raw.size != total_elems:
# Try infer last dim as 1 if desc has 2 dims
if len(shape) == 2 and raw.size == shape[0] * shape[1]:
pass
else:
raise ValueError(f"data.dat size mismatch. desc={shape}, fromfile={raw.size}")
if len(shape) == 3:
data = raw.reshape(shape)
else:
data = raw.reshape(shape + [1]) # (T, N, 1)
# Use only speed channel for this model (expects C=1)
if data.shape[-1] > 1:
data = data[..., :1]
x_offsets, y_offsets = generate_offsets(args.seq_len, args.pred_len)
x, y = make_sliding_windows(data, x_offsets, y_offsets)
ratios = desc.get("regular_settings", {}).get("TRAIN_VAL_TEST_RATIO", [0.7, 0.1, 0.2])
(x_train, y_train), (x_val, y_val), (x_test, y_test) = split_by_ratio(x, y, ratios)
for split_name, _x, _y in (
("train", x_train, y_train),
("val", x_val, y_val),
("test", x_test, y_test),
):
out_path = os.path.join(dataset_dir, f"{split_name}.npz")
np.savez_compressed(
out_path,
x=_x,
y=_y,
x_offsets=x_offsets.reshape(list(x_offsets.shape) + [1]),
y_offsets=y_offsets.reshape(list(y_offsets.shape) + [1]),
)
print(f"Saved {split_name} -> {out_path} | x={_x.shape}, y={_y.shape}")
print("Done.")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,45 @@
import os
from argparse import ArgumentParser, BooleanOptionalAction
import numpy as np
from tqdm import tqdm
data_dir_path = 'datasets/BLAST/train'
def main(clean_cache=False):
num_samples = 0
for i in range(99):
shape = tuple(np.load(data_dir_path + f'/shape_{i}_99.npy'))
N, L = shape
num_samples += N
merged_data = np.memmap(data_dir_path + '/data.dat', mode='w+', dtype=np.float32, shape=(num_samples, L))
print('Merging data...')
current_index = 0
for i in tqdm(range(99)):
shape = tuple(np.load(data_dir_path + f'/shape_{i}_99.npy'))
data = np.memmap(data_dir_path + f'/data_{i}_99.dat', mode='r', dtype=np.float32, shape=shape)
merged_data[current_index:current_index + shape[0]] = data
current_index += shape[0]
shape = merged_data.shape
np.save(data_dir_path + '/shape.npy', shape)
print('Data merged successfully.')
if clean_cache:
print('Cleaning cache...')
for i in tqdm(range(99)):
os.remove(data_dir_path + f'/data_{i}_99.dat')
os.remove(data_dir_path + f'/shape_{i}_99.npy')
print('Cache cleaned.')
def parse_args():
parser = ArgumentParser(description='Merge data files into a single memmap file.')
parser.add_argument('--clean_cache', action=BooleanOptionalAction, default=True, help='Clean cache after merging (disable with --no-clean_cache).')
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
main(clean_cache=args.clean_cache)

View File

@ -0,0 +1,103 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'BeijingAirQuality'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.xlsx'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'Beijing air quality'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_excel(data_file_path)
data = df.values
columns = df.columns
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
print('Columns: {0}'.format(columns))
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,136 @@
import json
import os
import pickle
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'CA'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.npy'
meta_file_path = f'datasets/raw_data/{dataset_name}/meta_{dataset_name}.csv'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1 ) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
adj_mx = np.load(graph_file_path)
with open(output_dir + '/adj_mx.pkl', 'wb') as f:
pickle.dump(adj_mx, f)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_meta_data():
'''Save the meta data to the output directory'''
output_meta_data_path = os.path.join(output_dir, 'meta.csv')
shutil.copyfile(meta_file_path, output_meta_data_path)
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Copy and save meta data
save_meta_data()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'ETTh1'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'electricity transformer temperature'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df = df.iloc[:20*30*24]
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings,
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'ETTh2'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'electricity transformer temperature'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df = df.iloc[:20*30*24]
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'ETTm1'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'electricity transformer temperature'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df = df.iloc[:20*30*24*4]
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'ETTm2'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'electricity transformer temperature'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df = df.iloc[:20*30*24*4]
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,120 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'Electricity'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'electricity consumption'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,120 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'ExchangeRate'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 1 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'exchange rate'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y/%m/%d %H:%M').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,136 @@
import json
import os
import pickle
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'GBA'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.npy'
meta_file_path = f'datasets/raw_data/{dataset_name}/meta_{dataset_name}.csv'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1 ) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
adj_mx = np.load(graph_file_path)
with open(output_dir + '/adj_mx.pkl', 'wb') as f:
pickle.dump(adj_mx, f)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_meta_data():
'''Save the meta data to the output directory'''
output_meta_data_path = os.path.join(output_dir, 'meta.csv')
shutil.copyfile(meta_file_path, output_meta_data_path)
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Copy and save meta data
save_meta_data()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,136 @@
import json
import os
import pickle
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'GLA'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.npy'
meta_file_path = f'datasets/raw_data/{dataset_name}/meta_{dataset_name}.csv'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1 ) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
adj_mx = np.load(graph_file_path)
with open(output_dir + '/adj_mx.pkl', 'wb') as f:
pickle.dump(adj_mx, f)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_meta_data():
'''Save the meta data to the output directory'''
output_meta_data_path = os.path.join(output_dir, 'meta.csv')
shutil.copyfile(meta_file_path, output_meta_data_path)
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Copy and save meta data
save_meta_data()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,73 @@
import json
import os
import numpy as np
# Hyperparameters
dataset_name = 'Gaussian'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npy'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
frequency = None
domain = 'simulated Gaussian data'
feature_description = [domain]
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Save processed data
save_data(data)
# Save dataset description
save_description(data)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,26 @@
import os
import numpy as np
import torch
PROJECT_DIR = os.path.abspath(__file__ + '/../../../..')
os.chdir(PROJECT_DIR)
# hyperparameters
duration = 10000 # time series length
def generate_gaussian_noise_sequence():
x = np.arange(0, duration, 1)
y = np.random.normal(0, 1, duration)
return x, y
# generate gaussian sequence
time_points, gaussian_noise_sequence = generate_gaussian_noise_sequence()
# save the gaussian noise sequence
data = torch.Tensor(gaussian_noise_sequence).unsqueeze(-1).unsqueeze(-1).numpy()
# mkdir datasets/raw_data/Gaussian
if not os.path.exists('datasets/raw_data/Gaussian'):
os.makedirs('datasets/raw_data/Gaussian')
np.save('datasets/raw_data/Gaussian/Gaussian.npy', data)

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'GlobalTemp'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day # minutes
domain = 'global temperature'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 48,
'OUTPUT_LEN': 24,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y/%m/%d %H:%M').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
# data = data / 10
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1. We need to minus 1 to make it start from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1. We subtract 1 to make it start from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,121 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'GlobalWind'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day # minutes
domain = 'global wind'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 48,
'OUTPUT_LEN': 24,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
data = data / 10
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day, day of week, day of month, and day of year as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,120 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'Illness'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 1/7 # Weekly data: one time step every 7 days
frequency = 1440 // steps_per_day # minutes between observations (weekly)
domain = 'illness data'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 96,
'OUTPUT_LEN': 48,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day, day of week, day of month, and day of year as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = (
df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
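# fraction of the day elapsed since midnight, in [0, 1): e.g. 06:00 -> 0.25 (assumes df has a DatetimeIndex)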
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,84 @@
import csv
import os
import pickle
import numpy as np
def get_adjacency_matrix(distance_df_filename: str, num_of_vertices: int, id_filename: str = None) -> tuple:
"""Generate adjacency matrix.
Args:
distance_df_filename (str): path of the CSV file that contains the edge information
num_of_vertices (int): number of vertices
id_filename (str, optional): id filename. Defaults to None.
Returns:
tuple: two adjacency matrices.
np.array: connectivity-based adjacency matrix A (A[i, j]=0 or A[i, j]=1)
np.array: distance-based adjacency matrix A
"""
if "npy" in distance_df_filename:
adj_mx = np.load(distance_df_filename)
return adj_mx, None
else:
adjacency_matrix_connectivity = np.zeros((int(num_of_vertices), int(
num_of_vertices)), dtype=np.float32)
adjacency_matrix_distance = np.zeros((int(num_of_vertices), int(num_of_vertices)),
dtype=np.float32)
if id_filename:
# the id in the distance file does not start from 0, so it needs to be remapped
with open(id_filename, "r") as f:
id_dict = {int(i): idx for idx, i in enumerate(
f.read().strip().split("\n"))} # map node idx to 0-based index (start from 0)
with open(distance_df_filename, "r") as f:
f.readline() # omit the first line
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[id_dict[i], id_dict[j]] = 1
adjacency_matrix_connectivity[id_dict[j], id_dict[i]] = 1
adjacency_matrix_distance[id_dict[i],
id_dict[j]] = distance
adjacency_matrix_distance[id_dict[j],
id_dict[i]] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
else:
# ids in distance file start from 0
with open(distance_df_filename, "r") as f:
f.readline()
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[i, j] = 1
adjacency_matrix_connectivity[j, i] = 1
adjacency_matrix_distance[i, j] = distance
adjacency_matrix_distance[j, i] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
def generate_adj_jinan():
distance_df_filename, num_of_vertices = "datasets/raw_data/JiNan/JiNan.csv", 406
if os.path.exists(distance_df_filename.split(".", maxsplit=1)[0] + ".txt"):
id_filename = distance_df_filename.split(".", maxsplit=1)[0] + ".txt"
else:
id_filename = None
adj_mx, distance_mx = get_adjacency_matrix(
distance_df_filename, num_of_vertices, id_filename=id_filename)
# the self loop is missing
add_self_loop = False
if add_self_loop:
print("adding self loop to adjacency matrices.")
adj_mx = adj_mx + np.identity(adj_mx.shape[0])
distance_mx = distance_mx + np.identity(distance_mx.shape[0])
else:
print("kindly note that there is no self loop in adjacency matrices.")
with open("datasets/raw_data/JiNan/adj_JiNan.pkl", "wb") as f:
pickle.dump(adj_mx, f)
with open("datasets/raw_data/JiNan/adj_JiNan_distance.pkl", "wb") as f:
pickle.dump(distance_mx, f)
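
For reference, the pickled matrices written above can be read back with pickle alone. A small sanity-check sketch (the expected shape and the symmetry/self-loop checks follow from how the matrix is filled above, and assume the CSV branch was taken rather than the .npy branch):

```python
import pickle

import numpy as np

with open('datasets/raw_data/JiNan/adj_JiNan.pkl', 'rb') as f:
    adj_mx = pickle.load(f)

print('shape:', adj_mx.shape)                       # expected (406, 406)
print('symmetric:', np.allclose(adj_mx, adj_mx.T))  # edges are written in both directions
print('self loops:', int(np.trace(adj_mx)))         # 0 unless add_self_loop was enabled
```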

View File

@ -0,0 +1,113 @@
import json
import os
import shutil
import numpy as np
from generate_adj_mx import generate_adj_jinan as generate_adj
# Hyperparameters
dataset_name = 'JiNan'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npz'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)['data']
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory, generating it if necessary.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
if os.path.exists(graph_file_path):
shutil.copyfile(graph_file_path, output_graph_path)
else:
generate_adj()
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Copy or generate and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,124 @@
import json
import os
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'METR-LA'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic speed channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic speed'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,124 @@
import json
import os
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'PEMS-BAY'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic speed channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic speed'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()
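
The train/val/test split itself is not part of this file. A minimal sketch of how the TRAIN_VAL_TEST_RATIO from regular_settings could be applied along the time axis of the saved memmap (the output file names and the contiguous-split strategy are assumptions, not the project's actual split code):

```python
import json
import os

import numpy as np

dataset_dir = 'datasets/PEMS-BAY'

# recover the array shape from desc.json, then map the raw float32 file
with open(os.path.join(dataset_dir, 'desc.json')) as f:
    desc = json.load(f)
data = np.memmap(os.path.join(dataset_dir, 'data.dat'),
                 dtype='float32', mode='r', shape=tuple(desc['shape']))

train_ratio, val_ratio, _ = desc['regular_settings']['TRAIN_VAL_TEST_RATIO']
num_steps = data.shape[0]
train_end = int(num_steps * train_ratio)
val_end = train_end + int(num_steps * val_ratio)

# contiguous split along the time axis; windowing into (INPUT_LEN, OUTPUT_LEN)
# samples would be built on top of these slices
np.savez(os.path.join(dataset_dir, 'train.npz'), data=np.asarray(data[:train_end]))
np.savez(os.path.join(dataset_dir, 'val.npz'), data=np.asarray(data[train_end:val_end]))
np.savez(os.path.join(dataset_dir, 'test.npz'), data=np.asarray(data[val_end:]))
```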

View File

@ -0,0 +1,84 @@
import csv
import os
import pickle
import numpy as np
def get_adjacency_matrix(distance_df_filename: str, num_of_vertices: int, id_filename: str = None) -> tuple:
"""Generate adjacency matrix.
Args:
distance_df_filename (str): path of the CSV file that contains the edge information
num_of_vertices (int): number of vertices
id_filename (str, optional): id filename. Defaults to None.
Returns:
tuple: two adjacency matrices.
np.array: connectivity-based adjacency matrix A (A[i, j]=0 or A[i, j]=1)
np.array: distance-based adjacency matrix A
"""
if "npy" in distance_df_filename:
adj_mx = np.load(distance_df_filename)
return adj_mx, None
else:
adjacency_matrix_connectivity = np.zeros((int(num_of_vertices), int(
num_of_vertices)), dtype=np.float32)
adjacency_matrix_distance = np.zeros((int(num_of_vertices), int(num_of_vertices)),
dtype=np.float32)
if id_filename:
# the id in the distance file does not start from 0, so it needs to be remapped
with open(id_filename, "r") as f:
id_dict = {int(i): idx for idx, i in enumerate(
f.read().strip().split("\n"))} # map node idx to 0-based index (start from 0)
with open(distance_df_filename, "r") as f:
f.readline() # omit the first line
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[id_dict[i], id_dict[j]] = 1
adjacency_matrix_connectivity[id_dict[j], id_dict[i]] = 1
adjacency_matrix_distance[id_dict[i],
id_dict[j]] = distance
adjacency_matrix_distance[id_dict[j],
id_dict[i]] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
else:
# ids in distance file start from 0
with open(distance_df_filename, "r") as f:
f.readline()
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[i, j] = 1
adjacency_matrix_connectivity[j, i] = 1
adjacency_matrix_distance[i, j] = distance
adjacency_matrix_distance[j, i] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
def generate_adj_pems03():
distance_df_filename, num_of_vertices = "datasets/raw_data/PEMS03/PEMS03.csv", 358
if os.path.exists(distance_df_filename.split(".", maxsplit=1)[0] + ".txt"):
id_filename = distance_df_filename.split(".", maxsplit=1)[0] + ".txt"
else:
id_filename = None
adj_mx, distance_mx = get_adjacency_matrix(
distance_df_filename, num_of_vertices, id_filename=id_filename)
# the self loop is missing
add_self_loop = False
if add_self_loop:
print("adding self loop to adjacency matrices.")
adj_mx = adj_mx + np.identity(adj_mx.shape[0])
distance_mx = distance_mx + np.identity(distance_mx.shape[0])
else:
print("kindly note that there is no self loop in adjacency matrices.")
with open("datasets/raw_data/PEMS03/adj_PEMS03.pkl", "wb") as f:
pickle.dump(adj_mx, f)
with open("datasets/raw_data/PEMS03/adj_PEMS03_distance.pkl", "wb") as f:
pickle.dump(distance_mx, f)

View File

@ -0,0 +1,113 @@
import json
import os
import shutil
import numpy as np
from generate_adj_mx import generate_adj_pems03 as generate_adj
# Hyperparameters
dataset_name = 'PEMS03'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npz'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)['data']
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory, generating it if necessary.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
if os.path.exists(graph_file_path):
shutil.copyfile(graph_file_path, output_graph_path)
else:
generate_adj()
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Copy or generate and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,84 @@
import csv
import os
import pickle
import numpy as np
def get_adjacency_matrix(distance_df_filename: str, num_of_vertices: int, id_filename: str = None) -> tuple:
"""Generate adjacency matrix.
Args:
distance_df_filename (str): path of the CSV file that contains the edge information
num_of_vertices (int): number of vertices
id_filename (str, optional): id filename. Defaults to None.
Returns:
tuple: two adjacency matrices.
np.array: connectivity-based adjacency matrix A (A[i, j]=0 or A[i, j]=1)
np.array: distance-based adjacency matrix A
"""
if "npy" in distance_df_filename:
adj_mx = np.load(distance_df_filename)
return adj_mx, None
else:
adjacency_matrix_connectivity = np.zeros((int(num_of_vertices), int(
num_of_vertices)), dtype=np.float32)
adjacency_matrix_distance = np.zeros((int(num_of_vertices), int(num_of_vertices)),
dtype=np.float32)
if id_filename:
# the id in the distance file does not start from 0, so it needs to be remapped
with open(id_filename, "r") as f:
id_dict = {int(i): idx for idx, i in enumerate(
f.read().strip().split("\n"))} # map node idx to 0-based index (start from 0)
with open(distance_df_filename, "r") as f:
f.readline() # omit the first line
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[id_dict[i], id_dict[j]] = 1
adjacency_matrix_connectivity[id_dict[j], id_dict[i]] = 1
adjacency_matrix_distance[id_dict[i],
id_dict[j]] = distance
adjacency_matrix_distance[id_dict[j],
id_dict[i]] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
else:
# ids in distance file start from 0
with open(distance_df_filename, "r") as f:
f.readline()
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[i, j] = 1
adjacency_matrix_connectivity[j, i] = 1
adjacency_matrix_distance[i, j] = distance
adjacency_matrix_distance[j, i] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
def generate_adj_pems04():
distance_df_filename, num_of_vertices = "datasets/raw_data/PEMS04/PEMS04.csv", 307
if os.path.exists(distance_df_filename.split(".", maxsplit=1)[0] + ".txt"):
id_filename = distance_df_filename.split(".", maxsplit=1)[0] + ".txt"
else:
id_filename = None
adj_mx, distance_mx = get_adjacency_matrix(
distance_df_filename, num_of_vertices, id_filename=id_filename)
# the self loop is missing
add_self_loop = False
if add_self_loop:
print("adding self loop to adjacency matrices.")
adj_mx = adj_mx + np.identity(adj_mx.shape[0])
distance_mx = distance_mx + np.identity(distance_mx.shape[0])
else:
print("kindly note that there is no self loop in adjacency matrices.")
with open("datasets/raw_data/PEMS04/adj_PEMS04.pkl", "wb") as f:
pickle.dump(adj_mx, f)
with open("datasets/raw_data/PEMS04/adj_PEMS04_distance.pkl", "wb") as f:
pickle.dump(distance_mx, f)

View File

@ -0,0 +1,113 @@
import json
import os
import shutil
import numpy as np
from generate_adj_mx import generate_adj_pems04 as generate_adj
# Hyperparameters
dataset_name = 'PEMS04'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npz'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)['data']
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory, generating it if necessary.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
if os.path.exists(graph_file_path):
shutil.copyfile(graph_file_path, output_graph_path)
else:
generate_adj()
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Copy or generate and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,84 @@
import csv
import os
import pickle
import numpy as np
def get_adjacency_matrix(distance_df_filename: str, num_of_vertices: int, id_filename: str = None) -> tuple:
"""Generate adjacency matrix.
Args:
distance_df_filename (str): path of the CSV file that contains the edge information
num_of_vertices (int): number of vertices
id_filename (str, optional): id filename. Defaults to None.
Returns:
tuple: two adjacency matrices.
np.array: connectivity-based adjacency matrix A (A[i, j]=0 or A[i, j]=1)
np.array: distance-based adjacency matrix A
"""
if "npy" in distance_df_filename:
adj_mx = np.load(distance_df_filename)
return adj_mx, None
else:
adjacency_matrix_connectivity = np.zeros((int(num_of_vertices), int(
num_of_vertices)), dtype=np.float32)
adjacency_matrix_distance = np.zeros((int(num_of_vertices), int(num_of_vertices)),
dtype=np.float32)
if id_filename:
# the id in the distance file does not start from 0, so it needs to be remapped
with open(id_filename, "r") as f:
id_dict = {int(i): idx for idx, i in enumerate(
f.read().strip().split("\n"))} # map node idx to 0-based index (start from 0)
with open(distance_df_filename, "r") as f:
f.readline() # omit the first line
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[id_dict[i], id_dict[j]] = 1
adjacency_matrix_connectivity[id_dict[j], id_dict[i]] = 1
adjacency_matrix_distance[id_dict[i],
id_dict[j]] = distance
adjacency_matrix_distance[id_dict[j],
id_dict[i]] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
else:
# ids in distance file start from 0
with open(distance_df_filename, "r") as f:
f.readline()
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[i, j] = 1
adjacency_matrix_connectivity[j, i] = 1
adjacency_matrix_distance[i, j] = distance
adjacency_matrix_distance[j, i] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
def generate_adj_pems07():
distance_df_filename, num_of_vertices = "datasets/raw_data/PEMS07/PEMS07.csv", 883
if os.path.exists(distance_df_filename.split(".", maxsplit=1)[0] + ".txt"):
id_filename = distance_df_filename.split(".", maxsplit=1)[0] + ".txt"
else:
id_filename = None
adj_mx, distance_mx = get_adjacency_matrix(
distance_df_filename, num_of_vertices, id_filename=id_filename)
# the self loop is missing
add_self_loop = False
if add_self_loop:
print("adding self loop to adjacency matrices.")
adj_mx = adj_mx + np.identity(adj_mx.shape[0])
distance_mx = distance_mx + np.identity(distance_mx.shape[0])
else:
print("kindly note that there is no self loop in adjacency matrices.")
with open("datasets/raw_data/PEMS07/adj_PEMS07.pkl", "wb") as f:
pickle.dump(adj_mx, f)
with open("datasets/raw_data/PEMS07/adj_PEMS07_distance.pkl", "wb") as f:
pickle.dump(distance_mx, f)

View File

@ -0,0 +1,113 @@
import json
import os
import shutil
import numpy as np
from generate_adj_mx import generate_adj_pems07 as generate_adj
# Hyperparameters
dataset_name = 'PEMS07'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npz'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)['data']
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory, generating it if necessary.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
if os.path.exists(graph_file_path):
shutil.copyfile(graph_file_path, output_graph_path)
else:
generate_adj()
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Copy or generate and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,84 @@
import csv
import os
import pickle
import numpy as np
def get_adjacency_matrix(distance_df_filename: str, num_of_vertices: int, id_filename: str = None) -> tuple:
"""Generate adjacency matrix.
Args:
distance_df_filename (str): path of the CSV file that contains the edge information
num_of_vertices (int): number of vertices
id_filename (str, optional): id filename. Defaults to None.
Returns:
tuple: two adjacency matrices.
np.array: connectivity-based adjacency matrix A (A[i, j]=0 or A[i, j]=1)
np.array: distance-based adjacency matrix A
"""
if "npy" in distance_df_filename:
adj_mx = np.load(distance_df_filename)
return adj_mx, None
else:
adjacency_matrix_connectivity = np.zeros((int(num_of_vertices), int(
num_of_vertices)), dtype=np.float32)
adjacency_matrix_distance = np.zeros((int(num_of_vertices), int(num_of_vertices)),
dtype=np.float32)
if id_filename:
# the id in the distance file does not start from 0, so it needs to be remapped
with open(id_filename, "r") as f:
id_dict = {int(i): idx for idx, i in enumerate(
f.read().strip().split("\n"))} # map node idx to 0-based index (start from 0)
with open(distance_df_filename, "r") as f:
f.readline() # omit the first line
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[id_dict[i], id_dict[j]] = 1
adjacency_matrix_connectivity[id_dict[j], id_dict[i]] = 1
adjacency_matrix_distance[id_dict[i],
id_dict[j]] = distance
adjacency_matrix_distance[id_dict[j],
id_dict[i]] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
else:
# ids in distance file start from 0
with open(distance_df_filename, "r") as f:
f.readline()
reader = csv.reader(f)
for row in reader:
if len(row) != 3:
continue
i, j, distance = int(row[0]), int(row[1]), float(row[2])
adjacency_matrix_connectivity[i, j] = 1
adjacency_matrix_connectivity[j, i] = 1
adjacency_matrix_distance[i, j] = distance
adjacency_matrix_distance[j, i] = distance
return adjacency_matrix_connectivity, adjacency_matrix_distance
def generate_adj_pems08():
distance_df_filename, num_of_vertices = "datasets/raw_data/PEMS08/PEMS08.csv", 170
if os.path.exists(distance_df_filename.split(".", maxsplit=1)[0] + ".txt"):
id_filename = distance_df_filename.split(".", maxsplit=1)[0] + ".txt"
else:
id_filename = None
adj_mx, distance_mx = get_adjacency_matrix(
distance_df_filename, num_of_vertices, id_filename=id_filename)
# the self loop is missing
add_self_loop = False
if add_self_loop:
print("adding self loop to adjacency matrices.")
adj_mx = adj_mx + np.identity(adj_mx.shape[0])
distance_mx = distance_mx + np.identity(distance_mx.shape[0])
else:
print("kindly note that there is no self loop in adjacency matrices.")
with open("datasets/raw_data/PEMS08/adj_PEMS08.pkl", "wb") as f:
pickle.dump(adj_mx, f)
with open("datasets/raw_data/PEMS08/adj_PEMS08_distance.pkl", "wb") as f:
pickle.dump(distance_mx, f)

View File

@ -0,0 +1,113 @@
import json
import os
import shutil
import numpy as np
from generate_adj_mx import generate_adj_pems08 as generate_adj
# Hyperparameters
dataset_name = 'PEMS08'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npz'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.pkl'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
steps_per_day = 288 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)['data']
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def add_temporal_features(data):
'''Add time of day and day of week as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = np.array([i % steps_per_day / steps_per_day for i in range(l)])
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = np.array([(i // steps_per_day) % 7 / 7 for i in range(l)])
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory, generating it if necessary.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
if os.path.exists(graph_file_path):
shutil.copyfile(graph_file_path, output_graph_path)
else:
generate_adj()
shutil.copyfile(graph_file_path, output_graph_path)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data)
# Save processed data
save_data(data_with_features)
# Copy or generate and save adjacency matrix
save_graph()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,74 @@
import json
import os
import numpy as np
# Hyperparameters
dataset_name = 'Pulse'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.npy'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target channel
frequency = None
domain = 'simulated pulse data'
feature_description = [domain]
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
data = np.load(data_file_path)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data = load_and_preprocess_data()
# Save processed data
save_data(data)
# Save dataset description
save_description(data)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,33 @@
import os
import numpy as np
import torch
PROJECT_DIR = os.path.abspath(__file__ + '/../../../..')
os.chdir(PROJECT_DIR)
# Hyperparameters
duration = 20000 # time series length
min_interval = 30 # minimum interval between two pulses
max_interval = 30 # maximum interval between two pulses
def generate_pulse_sequence():
x = np.arange(0, duration, 1)
y = np.zeros_like(x)
current_time = 0
while current_time < duration:
pulse_interval = np.random.uniform(min_interval, max_interval)
pulse_width = 1
y[int(current_time):int(current_time + pulse_width)] = 1
current_time += pulse_interval + pulse_width
return x, y
# generate pulse sequence
time_points, pulse_sequence = generate_pulse_sequence()
# save pulse sequence
data = torch.Tensor(pulse_sequence).unsqueeze(-1).unsqueeze(-1).numpy()
np.save('datasets/raw_data/Pulse/Pulse.npy', data)
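
A quick sanity check of the generated file, assuming the script above was run from the project root (the expected values follow from the hyperparameters above):

```python
import numpy as np

pulse = np.load('datasets/raw_data/Pulse/Pulse.npy')
print(pulse.shape)       # expected (20000, 1, 1): L x N x C with a single node and channel
print(pulse.sum())       # number of pulse steps, roughly duration / (min_interval + pulse_width)
print(np.unique(pulse))  # values are 0.0 and 1.0
```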

View File

@ -0,0 +1,136 @@
import json
import os
import pickle
import shutil
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'SD'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.h5'
graph_file_path = f'datasets/raw_data/{dataset_name}/adj_{dataset_name}.npy'
meta_file_path = f'datasets/raw_data/{dataset_name}/meta_{dataset_name}.csv'
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target traffic flow channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = False # Add day of the month as a feature
add_day_of_year = False # Add day of the year as a feature
steps_per_day = 96 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'traffic flow'
feature_description = [domain, 'time of day', 'day of week']
regular_settings = {
'INPUT_LEN': 12,
'OUTPUT_LEN': 12,
'TRAIN_VAL_TEST_RATIO': [0.6, 0.2, 0.2],
'NORM_EACH_CHANNEL': False,
'RESCALE': True,
'METRICS': ['MAE', 'RMSE', 'MAPE'],
'NULL_VAL': 0.0
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_hdf(data_file_path)
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day and day of week as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
time_of_day = (df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
time_of_day_tiled = np.tile(time_of_day, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(time_of_day_tiled)
if add_day_of_week:
day_of_week = df.index.dayofweek / 7
day_of_week_tiled = np.tile(day_of_week, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_week_tiled)
if add_day_of_month:
# numerical day_of_month
day_of_month = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
day_of_month_tiled = np.tile(day_of_month, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_month_tiled)
if add_day_of_year:
# numerical day_of_year
day_of_year = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
day_of_year_tiled = np.tile(day_of_year, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(day_of_year_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_graph():
'''Save the adjacency matrix to the output directory.'''
output_graph_path = os.path.join(output_dir, 'adj_mx.pkl')
adj_mx = np.load(graph_file_path)
with open(output_graph_path, 'wb') as f:
pickle.dump(adj_mx, f)
print(f'Adjacency matrix saved to {output_graph_path}')
def save_meta_data():
'''Save the meta data to the output directory'''
output_meta_data_path = os.path.join(output_dir, 'meta.csv')
shutil.copyfile(meta_file_path, output_meta_data_path)
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Copy and save adjacency matrix
save_graph()
# Copy and save meta data
save_meta_data()
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,120 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'Traffic'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target channel
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 24 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'road occupancy rates'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day, day of week, day of month, and day of year as features to the data.'''
_, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = (
df.index.values - df.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()
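The saved data.dat is a raw float32 memmap whose shape is recorded only in desc.json, so the two files have to be read back together. A minimal read-back sketch (the `datasets/Traffic` path is just an example; substitute whatever folder this script actually wrote):

```python
import json
import numpy as np

dataset_dir = 'datasets/Traffic'  # hypothetical output folder written by the script above
with open(f'{dataset_dir}/desc.json') as f:
    desc = json.load(f)

# mode='r' maps the file read-only instead of loading it all into memory
data = np.memmap(f'{dataset_dir}/data.dat', dtype='float32', mode='r',
                 shape=tuple(desc['shape']))
print(data.shape)  # (L, N, C): time steps x nodes x features
```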

View File

@ -0,0 +1,120 @@
import json
import os
import numpy as np
import pandas as pd
# Hyperparameters
dataset_name = 'Weather'
data_file_path = f'datasets/raw_data/{dataset_name}/{dataset_name}.csv'
graph_file_path = None
output_dir = f'datasets/{dataset_name}'
target_channel = [0] # Target channel to forecast (index into the raw feature columns)
add_time_of_day = True # Add time of day as a feature
add_day_of_week = True # Add day of the week as a feature
add_day_of_month = True # Add day of the month as a feature
add_day_of_year = True # Add day of the year as a feature
steps_per_day = 144 # Number of time steps per day
frequency = 1440 // steps_per_day
domain = 'weather'
feature_description = [domain, 'time of day', 'day of week', 'day of month', 'day of year']
regular_settings = {
'INPUT_LEN': 336,
'OUTPUT_LEN': 336,
'TRAIN_VAL_TEST_RATIO': [0.7, 0.1, 0.2],
'NORM_EACH_CHANNEL': True,
'RESCALE': False,
'METRICS': ['MAE', 'MSE'],
'NULL_VAL': np.nan
}
def load_and_preprocess_data():
'''Load and preprocess raw data, selecting the specified channel(s).'''
df = pd.read_csv(data_file_path)
df_index = pd.to_datetime(df['date'].values, format='%Y-%m-%d %H:%M:%S').to_numpy()
df = df[df.columns[1:]]
df.index = df_index
data = np.expand_dims(df.values, axis=-1)
data = data[..., target_channel]
print(f'Raw time series shape: {data.shape}')
return data, df
def add_temporal_features(data, df):
'''Add time of day, day of week, day of month, and day of year as features to the data.'''
l, n, _ = data.shape
feature_list = [data]
if add_time_of_day:
# numerical time_of_day
tod = [i % steps_per_day / steps_per_day for i in range(l)]
tod = np.array(tod)
tod_tiled = np.tile(tod, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(tod_tiled)
if add_day_of_week:
# numerical day_of_week
dow = df.index.dayofweek / 7
dow_tiled = np.tile(dow, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dow_tiled)
if add_day_of_month:
# numerical day_of_month
dom = (df.index.day - 1) / 31 # df.index.day starts from 1; subtract 1 so it starts from 0.
dom_tiled = np.tile(dom, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(dom_tiled)
if add_day_of_year:
# numerical day_of_year
doy = (df.index.dayofyear - 1) / 366 # df.index.dayofyear starts from 1; subtract 1 so it starts from 0.
doy_tiled = np.tile(doy, [1, n, 1]).transpose((2, 1, 0))
feature_list.append(doy_tiled)
data_with_features = np.concatenate(feature_list, axis=-1) # L x N x C
return data_with_features
def save_data(data):
'''Save the preprocessed data to a binary file.'''
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_path = os.path.join(output_dir, 'data.dat')
fp = np.memmap(file_path, dtype='float32', mode='w+', shape=data.shape)
fp[:] = data[:]
fp.flush()
del fp
print(f'Data saved to {file_path}')
def save_description(data):
'''Save a description of the dataset to a JSON file.'''
description = {
'name': dataset_name,
'domain': domain,
'shape': data.shape,
'num_time_steps': data.shape[0],
'num_nodes': data.shape[1],
'num_features': data.shape[2],
'feature_description': feature_description,
'has_graph': graph_file_path is not None,
'frequency (minutes)': frequency,
'regular_settings': regular_settings
}
description_path = os.path.join(output_dir, 'desc.json')
with open(description_path, 'w') as f:
json.dump(description, f, indent=4)
print(f'Description saved to {description_path}')
print(description)
def main():
# Load and preprocess data
data, df = load_and_preprocess_data()
# Add temporal features
data_with_features = add_temporal_features(data, df)
# Save processed data
save_data(data_with_features)
# Save dataset description
save_description(data_with_features)
if __name__ == '__main__':
main()
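As a quick sanity check on the feature layout, the sketch below calls add_temporal_features from the script above on a toy 48-step, 3-node series (random values; only the shapes and the appended channels matter, and the toy hourly index does not match the Weather dataset's 10-minute steps):

```python
import numpy as np
import pandas as pd

toy_index = pd.date_range('2020-01-01', periods=48, freq='H')  # purely illustrative index
toy_df = pd.DataFrame(np.random.rand(48, 3), index=toy_index)  # L x N raw values
toy_data = np.expand_dims(toy_df.values, axis=-1)              # L x N x 1

features = add_temporal_features(toy_data, toy_df)
print(features.shape)  # (48, 3, 5): raw value + time of day + day of week + day of month + day of year
```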

30
scripts/data_preparation/run.sh Executable file
View File

@ -0,0 +1,30 @@
#!/bin/bash
# spatial-temporal forecasting
python scripts/data_preparation/METR-LA/generate_training_data.py
python scripts/data_preparation/PEMS-BAY/generate_training_data.py
python scripts/data_preparation/PEMS03/generate_training_data.py
python scripts/data_preparation/PEMS04/generate_training_data.py
python scripts/data_preparation/PEMS07/generate_training_data.py
python scripts/data_preparation/PEMS08/generate_training_data.py
# long-term time series forecasting
python scripts/data_preparation/ETTh1/generate_training_data.py
python scripts/data_preparation/ETTh2/generate_training_data.py
python scripts/data_preparation/ETTm1/generate_training_data.py
python scripts/data_preparation/ETTm2/generate_training_data.py
python scripts/data_preparation/Electricity/generate_training_data.py
python scripts/data_preparation/Weather/generate_training_data.py
python scripts/data_preparation/ExchangeRate/generate_training_data.py
python scripts/data_preparation/Illness/generate_training_data.py
python scripts/data_preparation/Traffic/generate_training_data.py
# large-scale mts forecasting
python scripts/data_preparation/CA/generate_training_data.py
python scripts/data_preparation/GBA/generate_training_data.py
python scripts/data_preparation/GLA/generate_training_data.py
python scripts/data_preparation/SD/generate_training_data.py
python scripts/data_preparation/BeijingAirQuality/generate_training_data.py
python scripts/data_preparation/Gaussian/generate_training_data.py
python scripts/data_preparation/Pulse/generate_training_data.py

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,251 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import math\n",
"import torch\n",
"\n",
"PROJECT_DIR = os.path.abspath(os.path.abspath('') + \"/../..\")\n",
"os.chdir(PROJECT_DIR)\n",
"\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"from basicts.data import TimeSeriesForecastingDataset\n",
"from basicts.utils import get_regular_settings\n",
"from basicts.scaler import ZScoreScaler\n",
"\n",
"\n",
"metric = \"cosine\" # metric used to calculate the similarity.\n",
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"DATA_NAME = \"METR-LA\"\n",
"DATA_NAME = \"ETTh1\"\n",
"BATCH_SIZE = 8\n",
"regular_settings = get_regular_settings(DATA_NAME)\n",
"INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence\n",
"OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence\n",
"TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios\n",
"RESCALE = regular_settings['RESCALE'] # Whether to rescale the data\n",
"NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data\n",
"NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## utilities"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# similarity computation\n",
"def cosine_similarity(x, y):\n",
" # denominator\n",
" l2_x = torch.norm(x, dim=2, p=2) + 1e-7\n",
" l2_y = torch.norm(y, dim=2, p=2) + 1e-7\n",
" l2_n = torch.matmul(l2_x.unsqueeze(dim=2), l2_y.unsqueeze(dim=2).transpose(1, 2))\n",
" # numerator\n",
" l2_d = torch.matmul(x, y.transpose(1, 2))\n",
" return l2_d / l2_n\n",
"\n",
"def get_similarity_matrix(data, metric):\n",
" if metric == \"cosine\":\n",
" sim = cosine_similarity(data, data)\n",
" elif metric == \"mse\":\n",
" sim = torch.cdist(data, data, p=2)\n",
" elif metric == \"mae\":\n",
" sim = torch.cdist(data, data, p=1)\n",
" else:\n",
" raise NotImplementedError\n",
" return sim"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"dataset_param = {\n",
" 'dataset_name': DATA_NAME,\n",
" 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO,\n",
" 'input_len': INPUT_LEN,\n",
" 'output_len': OUTPUT_LEN,\n",
"}\n",
"# get dataloader\n",
"dataset = TimeSeriesForecastingDataset(**dataset_param, mode='train')\n",
"# the whole training data\n",
"dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=3)\n",
"\n",
"scaler_param = {\n",
" 'dataset_name': DATA_NAME,\n",
" 'train_ratio': TRAIN_VAL_TEST_RATIO[0],\n",
" 'norm_each_channel': NORM_EACH_CHANNEL,\n",
" 'rescale': RESCALE,\n",
"}\n",
"scaler = ZScoreScaler(**scaler_param)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate Similarity Matrix"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 997/997 [00:02<00:00, 412.47it/s]\n"
]
}
],
"source": [
"# get similarity matrices\n",
"\n",
"# inference pipeline for a given dataloader\n",
"history_adjs_all = []\n",
"future_adjs_all = []\n",
"def inference(dataloader):\n",
" for batch in tqdm(dataloader):\n",
" future_data, history_data = batch['target'], batch['inputs']\n",
" future_data = scaler.transform(future_data)\n",
" history_data = scaler.transform(history_data)\n",
" history_data = history_data[..., 0].transpose(1, 2) # batch_size, num_nodes, history_seq_len\n",
" future_data = future_data[..., 0].transpose(1, 2) # batch_size, num_nodes, future_seq_len\n",
" history_adjs = get_similarity_matrix(history_data, metric) # batch_size, num_nodes, num_nodes\n",
" future_adjs = get_similarity_matrix(future_data, metric) # batch_size, num_nodes, num_nodes\n",
" history_adjs_all.append(history_adjs)\n",
" future_adjs_all.append(future_adjs)\n",
"# get similarity matrices\n",
"# for mode in [\"valid\"]:\n",
"for mode in [\"train\"]:\n",
" inference(dataloader)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([7969, 7, 7])\n"
]
}
],
"source": [
"# get spatial indistinguishability ratio\n",
"history_similarity = torch.cat(history_adjs_all, dim=0).detach().cpu() # num_samples, num_modes, num_nodes\n",
"future_similarity = torch.cat(future_adjs_all, dim=0).detach().cpu() # num_samples, num_modes, num_nodes\n",
"L, N, N = future_similarity.shape\n",
"print(future_similarity.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Spatial Indistinguishability Ratio"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"e_u = 0.9\n",
"e_l = 0.4\n",
"\n",
"history_similarity_filtered = torch.where(history_similarity > e_u, torch.ones_like(history_similarity), torch.zeros_like(history_similarity))\n",
"future_similarity_filtered = torch.where(future_similarity < e_l, torch.ones_like(future_similarity), torch.zeros_like(future_similarity))\n",
"overlap = history_similarity_filtered * future_similarity_filtered\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(3.8568)\n"
]
}
],
"source": [
"# overlap ratio\n",
"overlap_ratio = overlap.sum() / (L * N * N)\n",
"print(overlap_ratio * 1000)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(15.7748)\n"
]
}
],
"source": [
"# indistinguishability ratio\n",
"indistinguishability_ratio = overlap.sum() / history_similarity_filtered.sum()\n",
"print(indistinguishability_ratio * 1000)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "BasicTS",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
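Outside the notebook, the thresholding above boils down to a few tensor operations. A self-contained sketch with random stand-in similarity tensors (shapes and thresholds follow the notebook; the resulting numbers are meaningless):

```python
import torch

L, N = 100, 7        # hypothetical number of samples and nodes
e_u, e_l = 0.9, 0.4  # upper/lower similarity thresholds from the notebook

history_similarity = torch.rand(L, N, N)  # stand-in for input-window cosine similarities
future_similarity = torch.rand(L, N, N)   # stand-in for target-window cosine similarities

history_mask = (history_similarity > e_u).float()  # node pairs that look alike in the input window
future_mask = (future_similarity < e_l).float()    # node pairs that diverge in the target window
overlap = history_mask * future_mask

overlap_ratio = overlap.sum() / (L * N * N)
indistinguishability_ratio = overlap.sum() / history_mask.sum().clamp(min=1)
print(overlap_ratio.item(), indistinguishability_ratio.item())
```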

File diff suppressed because one or more lines are too long

487
scripts/dataset_analysis.py Normal file
View File

@ -0,0 +1,487 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Dataset analysis script.
Reads the datasets of the BasicTS project and generates a detailed report,
covering node/edge counts, temporal frequency, missing-value rate, spatial coverage density, and more.
"""
import os
import json
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
# Font settings (SimHei listed first so any CJK glyphs still render)
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
class DatasetAnalyzer:
"""数据集分析器"""
def __init__(self, datasets_dir: str = "datasets"):
"""
Initialize the dataset analyzer.
Args:
datasets_dir: path to the datasets directory
"""
self.datasets_dir = Path(datasets_dir)
self.datasets_info = {}
self.analysis_results = {}
def get_available_datasets(self) -> List[str]:
"""获取可用的数据集列表"""
datasets = []
for item in self.datasets_dir.iterdir():
if item.is_dir() and (item / "desc.json").exists():
datasets.append(item.name)
return sorted(datasets)
def load_dataset_description(self, dataset_name: str) -> Dict:
"""加载数据集描述文件"""
desc_path = self.datasets_dir / dataset_name / "desc.json"
with open(desc_path, 'r', encoding='utf-8') as f:
return json.load(f)
def load_dataset_data(self, dataset_name: str) -> np.ndarray:
"""加载数据集数据"""
desc = self.load_dataset_description(dataset_name)
data_path = self.datasets_dir / dataset_name / "data.dat"
# Use a memmap to load the large data file
data = np.memmap(data_path, dtype='float32', mode='r',
shape=tuple(desc['shape']))
return data.copy() # copy into memory
def load_adjacency_matrix(self, dataset_name: str) -> Optional[np.ndarray]:
"""加载邻接矩阵(如果存在)"""
adj_path = self.datasets_dir / dataset_name / "adj_mx.pkl"
if adj_path.exists():
with open(adj_path, 'rb') as f:
adj_data = pickle.load(f)
# Handle the different adjacency-matrix formats
if isinstance(adj_data, tuple):
return adj_data[0] # the first element is usually the adjacency matrix
elif isinstance(adj_data, dict):
return adj_data.get('adj_mx', adj_data.get('adj', None))
else:
return adj_data
return None
def analyze_missing_values(self, data: np.ndarray, null_val: float = 0.0) -> Dict:
"""分析缺失值"""
# 计算缺失值
if np.isnan(null_val):
missing_mask = np.isnan(data)
else:
missing_mask = (data == null_val)
total_elements = data.size
missing_elements = np.sum(missing_mask)
missing_rate = (missing_elements / total_elements) * 100
# Missing values per time step and per node
missing_by_time = np.sum(missing_mask, axis=(1, 2)) if data.ndim == 3 else np.sum(missing_mask, axis=1)
missing_by_node = np.sum(missing_mask, axis=(0, 2)) if data.ndim == 3 else np.sum(missing_mask, axis=0)
return {
'total_missing_rate': missing_rate,
'missing_elements': missing_elements,
'total_elements': total_elements,
'missing_by_time': missing_by_time,
'missing_by_node': missing_by_node,
'max_missing_time': np.max(missing_by_time),
'max_missing_node': np.max(missing_by_node) if data.ndim == 3 else 0
}
def analyze_temporal_continuity(self, data: np.ndarray, freq_minutes: int) -> Dict:
"""分析时间连续性"""
# 计算时间跨度
total_time_steps = data.shape[0]
total_hours = (total_time_steps * freq_minutes) / 60
total_days = total_hours / 24
# Data density (proportion of non-zero data points)
non_zero_ratio = np.sum(data != 0) / data.size
return {
'total_time_steps': total_time_steps,
'frequency_minutes': freq_minutes,
'total_hours': total_hours,
'total_days': total_days,
'data_density': non_zero_ratio
}
def analyze_spatial_coverage(self, data: np.ndarray, adj_matrix: Optional[np.ndarray] = None) -> Dict:
"""分析空间覆盖"""
if data.ndim == 3:
num_nodes = data.shape[1]
num_features = data.shape[2]
else:
num_nodes = data.shape[1]
num_features = 1
# Adjacency-matrix statistics
edge_info = {}
if adj_matrix is not None:
num_edges = np.sum(adj_matrix > 0)
edge_density = num_edges / (num_nodes * num_nodes)
avg_degree = np.mean(np.sum(adj_matrix > 0, axis=1))
edge_info = {
'num_edges': int(num_edges),
'edge_density': edge_density,
'avg_degree': avg_degree,
'max_degree': int(np.max(np.sum(adj_matrix > 0, axis=1))),
'min_degree': int(np.min(np.sum(adj_matrix > 0, axis=1)))
}
return {
'num_nodes': num_nodes,
'num_features': num_features,
**edge_info
}
def analyze_dataset(self, dataset_name: str) -> Dict:
"""分析单个数据集"""
print(f"正在分析数据集: {dataset_name}")
# 加载数据
desc = self.load_dataset_description(dataset_name)
data = self.load_dataset_data(dataset_name)
adj_matrix = self.load_adjacency_matrix(dataset_name)
# Basic information
basic_info = {
'name': desc['name'],
'domain': desc['domain'],
'shape': desc['shape'],
'has_graph': desc.get('has_graph', False),
'frequency_minutes': desc.get('frequency (minutes)', None)
}
# Missing-value analysis
null_val = desc.get('regular_settings', {}).get('NULL_VAL', 0.0)
missing_analysis = self.analyze_missing_values(data, null_val)
# Temporal-continuity analysis
temporal_analysis = self.analyze_temporal_continuity(data, basic_info['frequency_minutes'])
# Spatial-coverage analysis
spatial_analysis = self.analyze_spatial_coverage(data, adj_matrix)
return {
'basic_info': basic_info,
'missing_analysis': missing_analysis,
'temporal_analysis': temporal_analysis,
'spatial_analysis': spatial_analysis,
'description': desc
}
def analyze_all_datasets(self) -> Dict:
"""分析所有数据集"""
datasets = self.get_available_datasets()
print(f"发现 {len(datasets)} 个数据集: {datasets}")
for dataset_name in datasets:
try:
self.analysis_results[dataset_name] = self.analyze_dataset(dataset_name)
except Exception as e:
print(f"分析数据集 {dataset_name} 时出错: {e}")
continue
return self.analysis_results
def generate_summary_report(self) -> str:
"""生成汇总报告"""
if not self.analysis_results:
return "没有可用的分析结果"
report = []
report.append("=" * 80)
report.append("BasicTS 数据集分析报告")
report.append("=" * 80)
report.append(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report.append(f"分析数据集数量: {len(self.analysis_results)}")
report.append("")
# 数据集概览表
report.append("数据集概览:")
report.append("-" * 80)
report.append(f"{'数据集名称':<15} {'领域':<20} {'时间步数':<10} {'节点数':<8} {'特征数':<8} {'频率(分钟)':<12} {'缺失值率(%)':<12}")
report.append("-" * 80)
for name, result in self.analysis_results.items():
basic = result['basic_info']
missing = result['missing_analysis']
spatial = result['spatial_analysis']
report.append(f"{name:<15} {basic['domain']:<20} {basic['shape'][0]:<10} "
f"{spatial['num_nodes']:<8} {spatial['num_features']:<8} "
f"{basic['frequency_minutes']:<12} {missing['total_missing_rate']:<12.3f}")
report.append("")
# Detailed analysis
for name, result in self.analysis_results.items():
report.append(f"Dataset: {name}")
report.append("-" * 40)
basic = result['basic_info']
missing = result['missing_analysis']
temporal = result['temporal_analysis']
spatial = result['spatial_analysis']
report.append(f"领域: {basic['domain']}")
report.append(f"数据形状: {basic['shape']}")
report.append(f"时间频率: {basic['frequency_minutes']} 分钟")
report.append(f"时间跨度: {temporal['total_days']:.1f} 天 ({temporal['total_hours']:.1f} 小时)")
report.append(f"节点数量: {spatial['num_nodes']}")
report.append(f"特征数量: {spatial['num_features']}")
if spatial.get('num_edges'):
report.append(f"边数量: {spatial['num_edges']}")
report.append(f"边密度: {spatial['edge_density']:.4f}")
report.append(f"平均度数: {spatial['avg_degree']:.2f}")
report.append(f"缺失值率: {missing['total_missing_rate']:.3f}%")
report.append(f"数据密度: {temporal['data_density']:.3f}")
report.append("")
return "\n".join(report)
def generate_comparative_analysis(self) -> str:
"""生成对比分析报告"""
if not self.analysis_results:
return "没有可用的分析结果"
report = []
report.append("=" * 80)
report.append("数据集对比分析")
report.append("=" * 80)
report.append("")
# Group by domain
domains = {}
for name, result in self.analysis_results.items():
domain = result['basic_info']['domain']
if domain not in domains:
domains[domain] = []
domains[domain].append((name, result))
for domain, datasets in domains.items():
report.append(f"领域: {domain}")
report.append("-" * 40)
# 该领域的数据集统计
missing_rates = [d[1]['missing_analysis']['total_missing_rate'] for d in datasets]
node_counts = [d[1]['spatial_analysis']['num_nodes'] for d in datasets]
time_steps = [d[1]['basic_info']['shape'][0] for d in datasets]
report.append(f"数据集数量: {len(datasets)}")
report.append(f"平均缺失值率: {np.mean(missing_rates):.3f}%")
report.append(f"缺失值率范围: {min(missing_rates):.3f}% - {max(missing_rates):.3f}%")
report.append(f"平均节点数: {np.mean(node_counts):.1f}")
report.append(f"节点数范围: {min(node_counts)} - {max(node_counts)}")
report.append(f"平均时间步数: {np.mean(time_steps):.0f}")
report.append("")
# Spatial coverage density analysis
report.append("Spatial coverage density analysis:")
report.append("-" * 40)
spatial_datasets = [(name, result) for name, result in self.analysis_results.items()
if result['spatial_analysis'].get('num_edges')]
if spatial_datasets:
for name, result in spatial_datasets:
spatial = result['spatial_analysis']
report.append(f"{name}: {spatial['num_nodes']} 个节点, {spatial['num_edges']} 条边, "
f"密度 {spatial['edge_density']:.4f}, 平均度数 {spatial['avg_degree']:.2f}")
else:
report.append("没有发现包含图结构的数据集")
report.append("")
# Temporal continuity analysis
report.append("Temporal continuity analysis:")
report.append("-" * 40)
temporal_data = []
for name, result in self.analysis_results.items():
temporal = result['temporal_analysis']
temporal_data.append({
'name': name,
'days': temporal['total_days'],
'density': temporal['data_density'],
'frequency': temporal['frequency_minutes']
})
# Sort by time span
temporal_data.sort(key=lambda x: x['days'], reverse=True)
for data in temporal_data:
report.append(f"{data['name']}: {data['days']:.1f} 天, "
f"数据密度 {data['density']:.3f}, "
f"频率 {data['frequency']} 分钟")
return "\n".join(report)
def save_reports(self, output_dir: str = "analysis_reports"):
"""保存分析报告"""
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True)
# Save the summary report
summary_report = self.generate_summary_report()
with open(output_path / "summary_report.txt", 'w', encoding='utf-8') as f:
f.write(summary_report)
# Save the comparative analysis report
comparative_report = self.generate_comparative_analysis()
with open(output_path / "comparative_analysis.txt", 'w', encoding='utf-8') as f:
f.write(comparative_report)
# Save the detailed JSON report
with open(output_path / "detailed_analysis.json", 'w', encoding='utf-8') as f:
json.dump(self.analysis_results, f, indent=2, ensure_ascii=False, default=str)
print(f"报告已保存到目录: {output_path}")
def create_visualizations(self, output_dir: str = "analysis_reports"):
"""创建可视化图表"""
if not self.analysis_results:
print("没有可用的分析结果")
return
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True)
# Chart style
plt.style.use('seaborn-v0_8')
# 1. Missing-value rate comparison
fig, ax = plt.subplots(figsize=(12, 6))
names = list(self.analysis_results.keys())
missing_rates = [self.analysis_results[name]['missing_analysis']['total_missing_rate']
for name in names]
bars = ax.bar(names, missing_rates, color='skyblue', alpha=0.7)
ax.set_title('Missing-value rate by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Missing-value rate (%)')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, rate in zip(bars, missing_rates):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
f'{rate:.2f}%', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "missing_rates_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 2. Node count comparison
fig, ax = plt.subplots(figsize=(12, 6))
node_counts = [self.analysis_results[name]['spatial_analysis']['num_nodes']
for name in names]
bars = ax.bar(names, node_counts, color='lightgreen', alpha=0.7)
ax.set_title('Node count by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Number of nodes')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, count in zip(bars, node_counts):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(node_counts)*0.01,
f'{count}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "node_counts_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 3. Time span comparison
fig, ax = plt.subplots(figsize=(12, 6))
time_days = [self.analysis_results[name]['temporal_analysis']['total_days']
for name in names]
bars = ax.bar(names, time_days, color='orange', alpha=0.7)
ax.set_title('Time span by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Time span (days)')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, days in zip(bars, time_days):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(time_days)*0.01,
f'{days:.1f}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "time_span_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 4. Scatter plot: node count vs. missing-value rate
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(node_counts, missing_rates, s=100, alpha=0.7, c='red')
# Add dataset labels
for i, name in enumerate(names):
ax.annotate(name, (node_counts[i], missing_rates[i]),
xytext=(5, 5), textcoords='offset points', fontsize=8)
ax.set_xlabel('Number of nodes')
ax.set_ylabel('Missing-value rate (%)')
ax.set_title('Node count vs. missing-value rate', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(output_path / "nodes_vs_missing_rates.png", dpi=300, bbox_inches='tight')
plt.close()
print(f"可视化图表已保存到目录: {output_path}")
def main():
"""主函数"""
print("BasicTS 数据集分析工具")
print("=" * 50)
# Create the analyzer
analyzer = DatasetAnalyzer()
# Analyze all datasets
analyzer.analyze_all_datasets()
# Generate and print the reports
print("\n" + "=" * 80)
print("Dataset Analysis Report")
print("=" * 80)
summary_report = analyzer.generate_summary_report()
print(summary_report)
print("\n" + "=" * 80)
print("对比分析报告")
print("=" * 80)
comparative_report = analyzer.generate_comparative_analysis()
print(comparative_report)
# Save reports and visualizations
analyzer.save_reports()
analyzer.create_visualizations()
print("\n分析完成!")
if __name__ == "__main__":
main()
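A minimal interactive usage sketch for the analyzer above (it assumes the repository root is on the Python path and that BasicTS-formatted folders, each with desc.json and data.dat, sit under ./datasets):

```python
from scripts.dataset_analysis import DatasetAnalyzer

analyzer = DatasetAnalyzer(datasets_dir="datasets")
analyzer.analyze_all_datasets()                                # prints progress per dataset
analyzer.save_reports(output_dir="analysis_reports")           # summary_report.txt, comparative_analysis.txt, detailed_analysis.json
analyzer.create_visualizations(output_dir="analysis_reports")  # PNG charts
```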

194
scripts/dataset_analyzer.py Normal file
View File

@ -0,0 +1,194 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Dataset analyzer.
Reads the datasets of the BasicTS project and generates a detailed report.
"""
import os
import json
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
class DatasetAnalyzer:
"""数据集分析器"""
def __init__(self, datasets_dir: str = "datasets"):
"""
Initialize the dataset analyzer.
Args:
datasets_dir: path to the datasets directory
"""
self.datasets_dir = Path(datasets_dir)
self.datasets_info = {}
self.analysis_results = {}
def get_available_datasets(self) -> List[str]:
"""获取可用的数据集列表"""
datasets = []
for item in self.datasets_dir.iterdir():
if item.is_dir() and (item / "desc.json").exists():
datasets.append(item.name)
return sorted(datasets)
def load_dataset_description(self, dataset_name: str) -> Dict:
"""加载数据集描述文件"""
desc_path = self.datasets_dir / dataset_name / "desc.json"
with open(desc_path, 'r', encoding='utf-8') as f:
return json.load(f)
def load_dataset_data(self, dataset_name: str) -> np.ndarray:
"""加载数据集数据"""
desc = self.load_dataset_description(dataset_name)
data_path = self.datasets_dir / dataset_name / "data.dat"
# Use a memmap to load the large data file
data = np.memmap(data_path, dtype='float32', mode='r',
shape=tuple(desc['shape']))
return data.copy() # copy into memory
def load_adjacency_matrix(self, dataset_name: str) -> Optional[np.ndarray]:
"""加载邻接矩阵(如果存在)"""
adj_path = self.datasets_dir / dataset_name / "adj_mx.pkl"
if adj_path.exists():
with open(adj_path, 'rb') as f:
adj_data = pickle.load(f)
# Handle the different adjacency-matrix formats
if isinstance(adj_data, tuple):
return adj_data[0] # the first element is usually the adjacency matrix
elif isinstance(adj_data, dict):
return adj_data.get('adj_mx', adj_data.get('adj', None))
else:
return adj_data
return None
def analyze_missing_values(self, data: np.ndarray, null_val: float = 0.0) -> Dict:
"""分析缺失值"""
# 计算缺失值
if np.isnan(null_val):
missing_mask = np.isnan(data)
else:
missing_mask = (data == null_val)
total_elements = data.size
missing_elements = np.sum(missing_mask)
missing_rate = (missing_elements / total_elements) * 100
# Missing values per time step and per node
missing_by_time = np.sum(missing_mask, axis=(1, 2)) if data.ndim == 3 else np.sum(missing_mask, axis=1)
missing_by_node = np.sum(missing_mask, axis=(0, 2)) if data.ndim == 3 else np.sum(missing_mask, axis=0)
return {
'total_missing_rate': missing_rate,
'missing_elements': missing_elements,
'total_elements': total_elements,
'missing_by_time': missing_by_time,
'missing_by_node': missing_by_node,
'max_missing_time': np.max(missing_by_time),
'max_missing_node': np.max(missing_by_node) if data.ndim == 3 else 0
}
def analyze_temporal_continuity(self, data: np.ndarray, freq_minutes: int) -> Dict:
"""分析时间连续性"""
# 计算时间跨度
total_time_steps = data.shape[0]
total_hours = (total_time_steps * freq_minutes) / 60
total_days = total_hours / 24
# Data density (proportion of non-zero data points)
non_zero_ratio = np.sum(data != 0) / data.size
return {
'total_time_steps': total_time_steps,
'frequency_minutes': freq_minutes,
'total_hours': total_hours,
'total_days': total_days,
'data_density': non_zero_ratio
}
def analyze_spatial_coverage(self, data: np.ndarray, adj_matrix: Optional[np.ndarray] = None) -> Dict:
"""分析空间覆盖"""
if data.ndim == 3:
num_nodes = data.shape[1]
num_features = data.shape[2]
else:
num_nodes = data.shape[1]
num_features = 1
# Adjacency-matrix statistics
edge_info = {}
if adj_matrix is not None:
num_edges = np.sum(adj_matrix > 0)
edge_density = num_edges / (num_nodes * num_nodes)
avg_degree = np.mean(np.sum(adj_matrix > 0, axis=1))
edge_info = {
'num_edges': int(num_edges),
'edge_density': edge_density,
'avg_degree': avg_degree,
'max_degree': int(np.max(np.sum(adj_matrix > 0, axis=1))),
'min_degree': int(np.min(np.sum(adj_matrix > 0, axis=1)))
}
return {
'num_nodes': num_nodes,
'num_features': num_features,
**edge_info
}
def analyze_dataset(self, dataset_name: str) -> Dict:
"""分析单个数据集"""
print(f"正在分析数据集: {dataset_name}")
# 加载数据
desc = self.load_dataset_description(dataset_name)
data = self.load_dataset_data(dataset_name)
adj_matrix = self.load_adjacency_matrix(dataset_name)
# Basic information
basic_info = {
'name': desc['name'],
'domain': desc['domain'],
'shape': desc['shape'],
'has_graph': desc.get('has_graph', False),
'frequency_minutes': desc.get('frequency (minutes)', None)
}
# Missing-value analysis
null_val = desc.get('regular_settings', {}).get('NULL_VAL', 0.0)
missing_analysis = self.analyze_missing_values(data, null_val)
# Temporal-continuity analysis
temporal_analysis = self.analyze_temporal_continuity(data, basic_info['frequency_minutes'])
# Spatial-coverage analysis
spatial_analysis = self.analyze_spatial_coverage(data, adj_matrix)
return {
'basic_info': basic_info,
'missing_analysis': missing_analysis,
'temporal_analysis': temporal_analysis,
'spatial_analysis': spatial_analysis,
'description': desc
}
def analyze_all_datasets(self) -> Dict:
"""分析所有数据集"""
datasets = self.get_available_datasets()
print(f"发现 {len(datasets)} 个数据集: {datasets}")
for dataset_name in datasets:
try:
self.analysis_results[dataset_name] = self.analyze_dataset(dataset_name)
except Exception as e:
print(f"分析数据集 {dataset_name} 时出错: {e}")
continue
return self.analysis_results
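For spot-checking a single dataset instead of the whole directory, analyze_dataset can also be called directly (a minimal sketch; PEMS-BAY is only an example and must exist under ./datasets in BasicTS format):

```python
from scripts.dataset_analyzer import DatasetAnalyzer

analyzer = DatasetAnalyzer("datasets")
result = analyzer.analyze_dataset("PEMS-BAY")
print(result["spatial_analysis"]["num_nodes"],
      f"{result['missing_analysis']['total_missing_rate']:.3f}%")
```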

View File

@ -0,0 +1,34 @@
import os
import sys
import argparse
def main():
parser = argparse.ArgumentParser(description="Download GPT-2 via kagglehub to target directory")
parser.add_argument("--target", type=str, default="/home/azureuser/code/REPST/GPT-2", help="Target directory to store GPT-2")
args = parser.parse_args()
try:
import kagglehub
except Exception as e:
print("[ERROR] kagglehub 未安装或导入失败。请先运行: pip install kagglehub")
print(" 需在 ~/.kaggle/kaggle.json 配置 Kaggle API。")
sys.exit(1)
os.makedirs(args.target, exist_ok=True)
handle = "openai/gpt-2"
print(f"开始通过 kagglehub 下载 {handle}{args.target} ...")
try:
path = kagglehub.model_download(handle, path=args.target)
except Exception as e:
print(f"[ERROR] 下载失败: {e}")
sys.exit(2)
print(f"下载完成,已保存到: {path}")
if __name__ == "__main__":
main()

279
scripts/report_generator.py Normal file
View File

@ -0,0 +1,279 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Report generator.
Generates detailed reports from the dataset analysis results.
"""
import json
import numpy as np
from pathlib import Path
from datetime import datetime
from typing import Dict, List
import matplotlib.pyplot as plt
import seaborn as sns
# Font settings (SimHei listed first so any CJK glyphs still render)
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
class ReportGenerator:
"""报告生成器"""
def __init__(self, analysis_results: Dict):
"""
Initialize the report generator.
Args:
analysis_results: dataset analysis results
"""
self.analysis_results = analysis_results
def generate_summary_report(self) -> str:
"""生成汇总报告"""
if not self.analysis_results:
return "没有可用的分析结果"
report = []
report.append("=" * 80)
report.append("BasicTS 数据集分析报告")
report.append("=" * 80)
report.append(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report.append(f"分析数据集数量: {len(self.analysis_results)}")
report.append("")
# 数据集概览表
report.append("数据集概览:")
report.append("-" * 80)
report.append(f"{'数据集名称':<15} {'领域':<20} {'时间步数':<10} {'节点数':<8} {'特征数':<8} {'频率(分钟)':<12} {'缺失值率(%)':<12}")
report.append("-" * 80)
for name, result in self.analysis_results.items():
basic = result['basic_info']
missing = result['missing_analysis']
spatial = result['spatial_analysis']
report.append(f"{name:<15} {basic['domain']:<20} {basic['shape'][0]:<10} "
f"{spatial['num_nodes']:<8} {spatial['num_features']:<8} "
f"{basic['frequency_minutes']:<12} {missing['total_missing_rate']:<12.3f}")
report.append("")
# Detailed analysis
for name, result in self.analysis_results.items():
report.append(f"Dataset: {name}")
report.append("-" * 40)
basic = result['basic_info']
missing = result['missing_analysis']
temporal = result['temporal_analysis']
spatial = result['spatial_analysis']
report.append(f"领域: {basic['domain']}")
report.append(f"数据形状: {basic['shape']}")
report.append(f"时间频率: {basic['frequency_minutes']} 分钟")
report.append(f"时间跨度: {temporal['total_days']:.1f} 天 ({temporal['total_hours']:.1f} 小时)")
report.append(f"节点数量: {spatial['num_nodes']}")
report.append(f"特征数量: {spatial['num_features']}")
if spatial.get('num_edges'):
report.append(f"边数量: {spatial['num_edges']}")
report.append(f"边密度: {spatial['edge_density']:.4f}")
report.append(f"平均度数: {spatial['avg_degree']:.2f}")
report.append(f"缺失值率: {missing['total_missing_rate']:.3f}%")
report.append(f"数据密度: {temporal['data_density']:.3f}")
report.append("")
return "\n".join(report)
def generate_comparative_analysis(self) -> str:
"""生成对比分析报告"""
if not self.analysis_results:
return "没有可用的分析结果"
report = []
report.append("=" * 80)
report.append("数据集对比分析")
report.append("=" * 80)
report.append("")
# Group by domain
domains = {}
for name, result in self.analysis_results.items():
domain = result['basic_info']['domain']
if domain not in domains:
domains[domain] = []
domains[domain].append((name, result))
for domain, datasets in domains.items():
report.append(f"领域: {domain}")
report.append("-" * 40)
# 该领域的数据集统计
missing_rates = [d[1]['missing_analysis']['total_missing_rate'] for d in datasets]
node_counts = [d[1]['spatial_analysis']['num_nodes'] for d in datasets]
time_steps = [d[1]['basic_info']['shape'][0] for d in datasets]
report.append(f"数据集数量: {len(datasets)}")
report.append(f"平均缺失值率: {np.mean(missing_rates):.3f}%")
report.append(f"缺失值率范围: {min(missing_rates):.3f}% - {max(missing_rates):.3f}%")
report.append(f"平均节点数: {np.mean(node_counts):.1f}")
report.append(f"节点数范围: {min(node_counts)} - {max(node_counts)}")
report.append(f"平均时间步数: {np.mean(time_steps):.0f}")
report.append("")
# Spatial coverage density analysis
report.append("Spatial coverage density analysis:")
report.append("-" * 40)
spatial_datasets = [(name, result) for name, result in self.analysis_results.items()
if result['spatial_analysis'].get('num_edges')]
if spatial_datasets:
for name, result in spatial_datasets:
spatial = result['spatial_analysis']
report.append(f"{name}: {spatial['num_nodes']} 个节点, {spatial['num_edges']} 条边, "
f"密度 {spatial['edge_density']:.4f}, 平均度数 {spatial['avg_degree']:.2f}")
else:
report.append("没有发现包含图结构的数据集")
report.append("")
# Temporal continuity analysis
report.append("Temporal continuity analysis:")
report.append("-" * 40)
temporal_data = []
for name, result in self.analysis_results.items():
temporal = result['temporal_analysis']
temporal_data.append({
'name': name,
'days': temporal['total_days'],
'density': temporal['data_density'],
'frequency': temporal['frequency_minutes']
})
# Sort by time span
temporal_data.sort(key=lambda x: x['days'], reverse=True)
for data in temporal_data:
report.append(f"{data['name']}: {data['days']:.1f} 天, "
f"数据密度 {data['density']:.3f}, "
f"频率 {data['frequency']} 分钟")
return "\n".join(report)
def create_visualizations(self, output_dir: str = "analysis_reports"):
"""创建可视化图表"""
if not self.analysis_results:
print("没有可用的分析结果")
return
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True)
# Chart style
plt.style.use('seaborn-v0_8')
# 1. Missing-value rate comparison
fig, ax = plt.subplots(figsize=(12, 6))
names = list(self.analysis_results.keys())
missing_rates = [self.analysis_results[name]['missing_analysis']['total_missing_rate']
for name in names]
bars = ax.bar(names, missing_rates, color='skyblue', alpha=0.7)
ax.set_title('Missing-value rate by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Missing-value rate (%)')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, rate in zip(bars, missing_rates):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
f'{rate:.2f}%', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "missing_rates_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 2. Node count comparison
fig, ax = plt.subplots(figsize=(12, 6))
node_counts = [self.analysis_results[name]['spatial_analysis']['num_nodes']
for name in names]
bars = ax.bar(names, node_counts, color='lightgreen', alpha=0.7)
ax.set_title('Node count by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Number of nodes')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, count in zip(bars, node_counts):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(node_counts)*0.01,
f'{count}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "node_counts_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 3. Time span comparison
fig, ax = plt.subplots(figsize=(12, 6))
time_days = [self.analysis_results[name]['temporal_analysis']['total_days']
for name in names]
bars = ax.bar(names, time_days, color='orange', alpha=0.7)
ax.set_title('Time span by dataset', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset')
ax.set_ylabel('Time span (days)')
ax.tick_params(axis='x', rotation=45)
# Add value labels
for bar, days in zip(bars, time_days):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(time_days)*0.01,
f'{days:.1f}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(output_path / "time_span_comparison.png", dpi=300, bbox_inches='tight')
plt.close()
# 4. Scatter plot: node count vs. missing-value rate
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(node_counts, missing_rates, s=100, alpha=0.7, c='red')
# Add dataset labels
for i, name in enumerate(names):
ax.annotate(name, (node_counts[i], missing_rates[i]),
xytext=(5, 5), textcoords='offset points', fontsize=8)
ax.set_xlabel('Number of nodes')
ax.set_ylabel('Missing-value rate (%)')
ax.set_title('Node count vs. missing-value rate', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(output_path / "nodes_vs_missing_rates.png", dpi=300, bbox_inches='tight')
plt.close()
print(f"可视化图表已保存到目录: {output_path}")
def save_reports(self, output_dir: str = "analysis_reports"):
"""保存分析报告"""
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True)
# Save the summary report
summary_report = self.generate_summary_report()
with open(output_path / "summary_report.txt", 'w', encoding='utf-8') as f:
f.write(summary_report)
# Save the comparative analysis report
comparative_report = self.generate_comparative_analysis()
with open(output_path / "comparative_analysis.txt", 'w', encoding='utf-8') as f:
f.write(comparative_report)
# Save the detailed JSON report
with open(output_path / "detailed_analysis.json", 'w', encoding='utf-8') as f:
json.dump(self.analysis_results, f, indent=2, ensure_ascii=False, default=str)
print(f"报告已保存到目录: {output_path}")

View File

@ -0,0 +1,62 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Main dataset-analysis script.
Runs the full dataset analysis pipeline.
"""
import sys
import os
from pathlib import Path
# Add the project root directory to the Python path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
from scripts.dataset_analyzer import DatasetAnalyzer
from scripts.report_generator import ReportGenerator
def main():
"""主函数"""
print("BasicTS 数据集分析工具")
print("=" * 50)
# Create the analyzer
analyzer = DatasetAnalyzer()
# Analyze all datasets
print("Starting dataset analysis...")
analysis_results = analyzer.analyze_all_datasets()
if not analysis_results:
print("没有找到可分析的数据集")
return
# Create the report generator
report_generator = ReportGenerator(analysis_results)
# Generate and print the reports
print("\n" + "=" * 80)
print("Dataset Analysis Report")
print("=" * 80)
summary_report = report_generator.generate_summary_report()
print(summary_report)
print("\n" + "=" * 80)
print("对比分析报告")
print("=" * 80)
comparative_report = report_generator.generate_comparative_analysis()
print(comparative_report)
# Save reports and visualizations
print("\nSaving reports and visualization charts...")
report_generator.save_reports()
report_generator.create_visualizations()
print("\n分析完成!")
print("报告文件保存在 'analysis_reports' 目录中")
if __name__ == "__main__":
main()