Compatible with BJTaxi

czzhangheng 2025-11-23 19:04:50 +08:00
parent b7ea73bc92
commit 475a4788cd
4 changed files with 93 additions and 21 deletions

.vscode/launch.json (vendored)

@@ -4,13 +4,6 @@
     // Visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
-        {
-            "name": "Python Debugger: Current File",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "${file}",
-            "console": "integratedTerminal"
-        },
         {
             "name": "STID_PEMS-BAY",
             "type": "debugpy",
@@ -35,6 +28,14 @@
             "console": "integratedTerminal",
             "args": "--config ./config/REPST/PEMSD8.yaml"
         },
+        {
+            "name": "REPST-BJTaxi-InFlow",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "run.py",
+            "console": "integratedTerminal",
+            "args": "--config ./config/REPST/BJTaxi-Inflow.yaml"
+        },
         {
             "name": "REPST-PEMSBAY",
             "type": "debugpy",

config/REPST/BJTaxi-Inflow.yaml (new executable file)

@@ -0,0 +1,60 @@
+basic:
+  dataset: "BJTaxi-InFlow"
+  mode: "train"
+  device: "cuda:0"
+  model: "REPST"
+  seed: 2023
+data:
+  add_day_in_week: false
+  add_time_in_day: false
+  column_wise: false
+  days_per_week: 7
+  default_graph: true
+  horizon: 24
+  lag: 24
+  normalizer: std
+  num_nodes: 1024
+  steps_per_day: 48
+  test_ratio: 0.2
+  tod: false
+  val_ratio: 0.2
+  sample: 1
+  input_dim: 1
+  batch_size: 16
+model:
+  pred_len: 24
+  seq_len: 24
+  patch_len: 6
+  stride: 7
+  dropout: 0.2
+  gpt_layers: 9
+  d_ff: 128
+  gpt_path: ./GPT-2
+  d_model: 64
+  n_heads: 1
+  input_dim: 1
+  word_num: 1000
+train:
+  batch_size: 16
+  early_stop: true
+  early_stop_patience: 15
+  epochs: 100
+  grad_norm: false
+  loss_func: mae
+  lr_decay: true
+  lr_decay_rate: 0.3
+  lr_decay_step: "5,20,40,70"
+  lr_init: 0.003
+  max_grad_norm: 5
+  real_value: true
+  weight_decay: 0
+  debug: false
+  output_dim: 1
+  log_step: 100
+  plot: false
+  mae_thresh: None
+  mape_thresh: 0.001
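
A quick sanity check for this config (a sketch, not part of the commit; assumes PyYAML): the flattened 32x32 TaxiBJ grid should match num_nodes, and the data horizon/lag should line up with the model's pred_len/seq_len:

import yaml

with open("./config/REPST/BJTaxi-Inflow.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

assert cfg["data"]["num_nodes"] == 32 * 32                  # flattened TaxiBJ grid
assert cfg["model"]["pred_len"] == cfg["data"]["horizon"]   # forecast lengths agree
assert cfg["model"]["seq_len"] == cfg["data"]["lag"]        # input lengths agree
print(cfg["basic"]["dataset"], "config looks consistent")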

@@ -55,6 +55,10 @@ def load_st_dataset(config):
         case "SD":
             data_path = os.path.join("./data/SD/data.npz")
             data = np.load(data_path)["data"][:, :, 0].astype(np.float32)
+        case "BJTaxi-InFlow":
+            data = read_BeijingTaxi()[:, :, 0:1].astype(np.float32)
+        case "BJTaxi-OutFlow":
+            data = read_BeijingTaxi()[:, :, 1:2].astype(np.float32)
         case _:
             raise ValueError(f"Unsupported dataset: {dataset}")
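
Note the 0:1 and 1:2 slices: unlike the [:, :, 0] used for SD above, they keep the trailing channel axis, so these branches return (time, nodes, 1) rather than (time, nodes). A tiny illustration (not part of the commit):

import numpy as np

x = np.zeros((48, 1024, 2), dtype=np.float32)  # (time, nodes, inflow/outflow)
print(x[:, :, 0].shape)    # (48, 1024)    -- channel axis dropped
print(x[:, :, 0:1].shape)  # (48, 1024, 1) -- channel axis kept
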
@@ -65,3 +69,16 @@ def load_st_dataset(config):
     print("Loading %s dataset... " % dataset)
     # return data[::sample]
     return data
+def read_BeijingTaxi():
+    files = ["TaxiBJ2013.npy", "TaxiBJ2014.npy", "TaxiBJ2015.npy",
+             "TaxiBJ2016_1.npy", "TaxiBJ2016_2.npy"]
+    all_data = []
+    for file in files:
+        data_path = os.path.join(f"./data/BeijingTaxi/{file}")
+        data = np.load(data_path)
+        all_data.append(data)
+    all_data = np.concatenate(all_data, axis=0)
+    time_num = all_data.shape[0]
+    all_data = all_data.transpose(0, 2, 3, 1).reshape(time_num, 32*32, 2)
+    return all_data
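
The transpose/reshape in read_BeijingTaxi assumes each TaxiBJ .npy file stores frames channels-first as (time, 2, 32, 32); the helper flattens the grid into 1024 nodes with the in/out-flow channels last. A self-contained check with dummy data (a sketch, not part of the commit):

import numpy as np

frames = np.random.rand(10, 2, 32, 32).astype(np.float32)   # (T, channel, H, W)
flat = frames.transpose(0, 2, 3, 1).reshape(10, 32 * 32, 2)

# Flattened node k corresponds to grid cell (k // 32, k % 32).
assert flat.shape == (10, 1024, 2)
assert np.array_equal(flat[:, 33, 0], frames[:, 0, 1, 1])   # channel 0 at cell (1, 1)
assert np.array_equal(flat[:, 33, 1], frames[:, 1, 1, 1])   # channel 1 at cell (1, 1)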

@@ -4,12 +4,8 @@ from tqdm import tqdm
 import kagglehub
 import py7zr
 
-# ---------- 1. Load the structure JSON ----------
-def load_structure_json(path="utils/dataset.json"):
-    with open(path, "r", encoding="utf-8") as f:
-        return json.load(f)
-
-# ---------- 2. Check data integrity ----------
+# ---------- 1. Check data integrity ----------
 def detect_data_integrity(data_dir, expected):
     missing_list = []
     if not os.path.isdir(data_dir):
@@ -53,7 +49,7 @@ def detect_data_integrity(data_dir, expected):
     # print(f"Missing datasets: {missing_list}")
     return missing_list
 
-# ---------- 3. Download and extract the 7z archive ----------
+# ---------- 2. Download and extract the 7z archive ----------
 def download_and_extract(url, dst_dir, max_retries=3):
     os.makedirs(dst_dir, exist_ok=True)
     filename = os.path.basename(urlsplit(url).path) or "download.7z"
@@ -77,7 +73,7 @@ def download_and_extract(url, dst_dir, max_retries=3):
         if attempt==max_retries: raise RuntimeError("Download or extraction failed")
         print("Error, retrying...", e)
 
-# ---------- 4. Download Kaggle data ----------
+# ---------- 3. Download Kaggle data ----------
 def download_kaggle_data(base_dir, dataset):
     try:
         print(f"Downloading kaggle dataset : {dataset}")
@@ -86,7 +82,7 @@ def download_kaggle_data(base_dir, dataset):
     except Exception as e:
         print("Kaggle download failed:", dataset, e)
 
-# ---------- 5. Download GitHub data ----------
+# ---------- 4. Download GitHub data ----------
 def download_github_data(file_path, save_dir):
     if not os.path.exists(save_dir):
         os.makedirs(save_dir)
@@ -136,13 +132,13 @@ def rearrange_dir():
 # ---------- 6. Main workflow ----------
 def check_and_download_data():
-    # Load the structure file and detect the missing datasets
     cwd = os.getcwd()
     data_dir = os.path.join(cwd,"data")
-    file_tree = load_structure_json()
+    with open("utils/dataset.json", "r", encoding="utf-8") as f:
+        file_tree = json.load(f)
 
-    # Run the check once to collect all missing items
     missing_list = detect_data_integrity(data_dir, file_tree)
-    print(f"Missing datasets: {missing_list}")
+    # print(f"Missing datasets: {missing_list}")
 
     # Check for and download the adj data
     if "adj" in missing_list:
@@ -167,7 +163,6 @@ def check_and_download_data():
     missing_list = detect_data_integrity(data_dir, file_tree)
 
     # Check for and download the pems, bay, metr-la, solar-energy data
-    # Map dataset names to their Kaggle datasets
     kaggle_map = {
         "PEMS03": "elmahy/pems-dataset",
         "PEMS04": "elmahy/pems-dataset",
@@ -178,7 +173,6 @@ def check_and_download_data():
         "SolarEnergy": "wangshaoqi/solar-energy"
     }
 
-    # Check whether any datasets need to be downloaded from Kaggle
     # Deduplicate the Kaggle targets first to avoid downloading the same dataset twice
     downloaded_kaggle_datasets = set()
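
Several names in kaggle_map point at the same Kaggle dataset (all four PEMS entries resolve to elmahy/pems-dataset), which is what the deduplication set is for. A sketch of the loop this set presumably feeds (not shown in the hunk; download_kaggle_data, data_dir, and missing_list come from the surrounding code):

for name in missing_list:
    kaggle_id = kaggle_map.get(name)
    if kaggle_id is None or kaggle_id in downloaded_kaggle_datasets:
        continue  # not Kaggle-hosted, or its archive was already fetched
    download_kaggle_data(data_dir, kaggle_id)
    downloaded_kaggle_datasets.add(kaggle_id)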