e21-e26无改进

2025-04-21 20:31:09 +08:00 · 2025-04-21 20:31:09 +08:00 · e851eb21d6
parent 0b006087ea
commit e851eb21d6
17 changed files with 2936 additions and 53 deletions
--- a/baseline.ipynb
+++ b/baseline.ipynb
--- a/config/EXP/PEMSD4.yaml
+++ b/config/EXP/PEMSD4.yaml
@ -27,7 +27,7 @@ train:
  epochs: 300
  lr_init: 0.003
  weight_decay: 0
-  lr_decay: True
+  lr_decay: False
  lr_decay_rate: 0.5
  lr_decay_step: "5,20,40,65"
  early_stop: True
--- a/config/EXP/SD.yaml
+++ b/config/EXP/SD.yaml
@ -14,18 +14,10 @@ data:
  days_per_week: 7
 model:
  batch_size: 64
  input_dim: 1
  output_dim: 1
-  embed_dim: 12
+  in_len: 12
  rnn_units: 64
  num_layers: 1
  cheb_order: 2
  use_day: True
  use_week: True
  graph_size: 30
  expert_nums: 8
  top_k: 2
  hidden_dim: 64
 train:
  loss_func: mae
--- a/config/STID/PEMSD4.yaml
+++ b/config/STID/PEMSD4.yaml
@ -0,0 +1,58 @@
 data:
  num_nodes: 307
  lag: 12
  horizon: 12
  val_ratio: 0.2
  test_ratio: 0.2
  tod: False
  normalizer: std
  column_wise: False
  default_graph: True
  add_time_in_day: True
  add_day_in_week: True
  steps_per_day: 288
  days_per_week: 7
 model:
  # Model hyper-parameters for the STID configuration on PEMSD4.
  input_dim: 3
  output_dim: 1
  history: 12
  horizon: 12
  num_nodes: 307
  input_len: 12
  # Plain key: a trailing double quote here would become part of the key name.
  embed_dim: 32
  output_len: 12
  num_layer: 3
  if_node: True
  node_dim: 32
  if_T_i_D: True
  if_D_i_W: True
  temp_dim_tid: 32
  temp_dim_diw: 32
  time_of_day_size: 288
  day_of_week_size: 7
 train:
  loss_func: mae
  seed: 1
  batch_size: 64
  epochs: 300
  lr_init: 0.002
  weight_decay: 0.0001
  lr_decay: False
  lr_decay_rate: 0.3
  lr_decay_step: "1,50,80"
  early_stop: True
  early_stop_patience: 15
  grad_norm: False
  max_grad_norm: 5
  real_value: True
 test:
  mae_thresh: null
  mape_thresh: 0.0
 log:
  log_step: 200
  plot: False
--- a/lib/initializer.py
+++ b/lib/initializer.py
@ -12,6 +12,8 @@ def init_model(args, device):
            nn.init.xavier_uniform_(p)
        else:
            nn.init.uniform_(p)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Model has {total_params} parameters")
    return model
 def init_optimizer(model, args):
--- a/model/EXP/EXP2.py
+++ b/model/EXP/EXP2.py
@ -21,7 +21,7 @@ class PositionalEncoding(nn.Module):
        return x + self.pe[:T].unsqueeze(1)            # (T,1,d_model) 广播到 (T,B,d_model)
-class TemporalTransformerForecast(nn.Module):
+class EXP(nn.Module):
    """
    Transformer-based 多步预测：
      - 只使用 x[...,0] 作为输入通道
--- a/model/EXP/EXP21.py
+++ b/model/EXP/EXP21.py
@ -4,7 +4,7 @@ import torch.nn.functional as F
 """
-使用多层感知机替换输入输出的proj层
+添加时间嵌入
 """
 class DynamicGraphConstructor(nn.Module):
@ -104,6 +104,7 @@ class EXP(nn.Module):
        self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
        self.day_embedding  = nn.Embedding(7, self.hidden_dim)
        # input projection now still only takes the flow history
        self.input_proj = MLP(
            in_dim      = self.seq_len,
--- a/model/EXP/EXP22.py
+++ b/model/EXP/EXP22.py
@ -2,11 +2,10 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 """
 添加空间嵌入
 """
 """
 使用多层感知机替换输入输出的 proj 层，
 并在 EXP 模型中添加显式的空间嵌入（Spatial Embedding）。
 """
 class DynamicGraphConstructor(nn.Module):
    def __init__(self, node_num, embed_dim):
@ -35,8 +34,8 @@ class GraphConvBlock(nn.Module):
    def forward(self, x, adj):
        # x: (B, N, F_in), adj: (N, N)
        res = x
-        x = torch.matmul(adj, x)      # 邻接乘特征
+        x = torch.matmul(adj, x)  # 邻接乘特征
-        x = self.theta(x)             # 线性变换
+        x = self.theta(x)  # 线性变换
        # 残差连接
        x = x + (res if self.residual else self.res_proj(res))
        return F.relu(x)
@ -90,7 +89,7 @@ class MLP(nn.Module):
        dims = [in_dim] + hidden_dims + [out_dim]
        layers = []
        for i in range(len(dims) - 2):
-            layers += [nn.Linear(dims[i], dims[i+1]), activation()]
+            layers += [nn.Linear(dims[i], dims[i + 1]), activation()]
        layers += [nn.Linear(dims[-2], dims[-1])]
        self.net = nn.Sequential(*layers)
@ -103,17 +102,18 @@ class EXP(nn.Module):
    def __init__(self, args):
        super().__init__()
        # 训练 & 输出参数
-        self.horizon    = args['horizon']
+        self.horizon = args['horizon']
        self.output_dim = args['output_dim']
-        self.seq_len    = args.get('in_len', 12)
+        self.seq_len = args.get('in_len', 12)
        self.hidden_dim = args.get('hidden_dim', 64)
-        self.num_nodes  = args['num_nodes']
+        self.num_nodes = args['num_nodes']
-        self.embed_dim  = args.get('embed_dim', 16)
+        self.embed_dim = args.get('embed_dim', 16)
        # ==== 时间嵌入 ====
-        self.time_slots     = args.get('time_slots', 24 * 60 // args.get('time_slot', 5))
+        self.time_slots = args.get('time_slots', 24 * 60 // args.get('time_slot', 5))
        self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
-        self.day_embedding  = nn.Embedding(7, self.hidden_dim)
+        self.day_embedding = nn.Embedding(7, self.hidden_dim)
        self.node_emb = nn.Parameter(torch.empty(self.num_nodes, self.embed_dim))
        # ==== 空间嵌入 ====
        # 每个节点一个可学习的向量
@ -124,9 +124,9 @@ class EXP(nn.Module):
        # 输入投影：仅对流量序列做 MLP
        self.input_proj = MLP(
-            in_dim      = self.seq_len,
+            in_dim=self.seq_len,
-            hidden_dims = [self.hidden_dim],
+            hidden_dims=[self.hidden_dim],
-            out_dim     = self.hidden_dim
+            out_dim=self.hidden_dim
        )
        # 两个 SandwichBlock
@ -135,9 +135,9 @@ class EXP(nn.Module):
        # 输出投影
        self.out_proj = MLP(
-            in_dim      = self.hidden_dim,
+            in_dim=self.hidden_dim,
-            hidden_dims = [2 * self.hidden_dim],
+            hidden_dims=[2 * self.hidden_dim],
-            out_dim     = self.horizon * self.output_dim
+            out_dim=self.horizon * self.output_dim
        )
    def forward(self, x):
@ -151,7 +151,7 @@ class EXP(nn.Module):
        # 拆分三条序列
        x_flow = x[..., 0]  # (B, T, N)
        x_time = x[..., 1]  # (B, T, N)
-        x_day  = x[..., 2]  # (B, T, N)
+        x_day = x[..., 2]  # (B, T, N)
        B, T, N = x_flow.shape
        assert T == self.seq_len, f"序列长度应为 {self.seq_len}，但收到 {T}"
@ -162,14 +162,16 @@ class EXP(nn.Module):
        # 2) 计算离散时间嵌入
        t_idx = (x_time[:, -1, :] * (self.time_slots - 1)).long()  # (B, N)
-        d_idx = x_day[:,  -1, :].long()                           # (B, N)
+        d_idx = x_day[:, -1, :].long()  # (B, N)
-        time_emb = self.time_embedding(t_idx)                     # (B, N, hidden_dim)
+        time_emb = self.time_embedding(t_idx)  # (B, N, hidden_dim)
-        day_emb  = self.day_embedding(d_idx)                      # (B, N, hidden_dim)
+        day_emb = self.day_embedding(d_idx)  # (B, N, hidden_dim)
        # 3) 计算空间嵌入并扩展到 batch 大小
-        node_idx    = torch.arange(N, device=x.device)            # (N,)
+        # node_emb = []
-        spatial_emb = self.spatial_embedding[node_idx]            # (N, hidden_dim)
+        # node_emb.append(self.node_emb.unsqueeze(0).expand(
-        spatial_emb = spatial_emb.unsqueeze(0).expand(B, -1, -1)  # (B, N, hidden_dim)
+        #     B, -1, -1).transpose(1, 2).unsqueeze(-1))
        # spatial_emb = torch.stack(node_emb)
        spatial_emb = self.spatial_embedding.unsqueeze(0).expand(B, N, self.hidden_dim)  # -> (B, N, hidden_dim)
        # 4) 将三种嵌入相加到 h0
        h0 = h0 + time_emb + day_emb + spatial_emb
@ -180,7 +182,7 @@ class EXP(nn.Module):
        h2 = self.sandwich2(h1)
        # 6) 输出投影 -> (B, horizon, N, output_dim)
-        out = self.out_proj(h2)                   # (B, N, horizon*out_dim)
+        out = self.out_proj(h2)  # (B, N, horizon*out_dim)
        out = out.view(B, N, self.horizon, self.output_dim)
-        out = out.permute(0, 2, 1, 3)             # (B, horizon, N, output_dim)
+        out = out.permute(0, 2, 1, 3)  # (B, horizon, N, output_dim)
        return out
--- a/model/EXP/EXP23.py
+++ b/model/EXP/EXP23.py
@ -0,0 +1,159 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 """
 添加时间嵌入 + 基于可学习邻接矩阵的图构造
 """
 class DynamicGraphConstructor(nn.Module):
    """Learnable dense adjacency stored directly as an N x N parameter.

    Args:
        node_num: number of graph nodes; the parameter has shape
            (node_num, node_num).
    """

    def __init__(self, node_num):
        super().__init__()
        # Free-form matrix drawn from a standard normal at construction time.
        initial = torch.randn(node_num, node_num)
        self.adj_param = nn.Parameter(initial, requires_grad=True)

    def forward(self):
        """Return the row-normalised adjacency, shape (node_num, node_num)."""
        # Negative entries are clamped to zero first. The softmax that follows
        # still assigns them a non-zero weight, because exp(0) = 1.
        non_negative = torch.relu(self.adj_param)
        # Softmax along the last axis makes every row sum to one.
        return non_negative.softmax(dim=-1)
 class GraphConvBlock(nn.Module):
    """One graph-convolution step with a skip connection and a ReLU.

    Args:
        input_dim: size of the last axis of the incoming features.
        output_dim: size of the last axis of the result.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.theta = nn.Linear(input_dim, output_dim)
        # The skip path can be the identity only when both widths agree;
        # otherwise a separate linear map brings the input to output_dim.
        self.residual = input_dim == output_dim
        if not self.residual:
            self.res_proj = nn.Linear(input_dim, output_dim)

    def forward(self, x, adj):
        """Aggregate ``x`` with ``adj``, transform it, add the skip path, apply ReLU.

        Args:
            x: node features; the original comment gives the shape (B, N, C).
            adj: adjacency that left-multiplies ``x``.
        """
        skip = x if self.residual else self.res_proj(x)
        # Neighbourhood aggregation followed by the learnable transform.
        transformed = self.theta(adj @ x)
        return F.relu(transformed + skip)
 class MANBA_Block(nn.Module):
    """Self-attention followed by a feed-forward net, each with add-and-LayerNorm.

    Args:
        input_dim: feature width of the input and output; it is also the
            attention embedding size, split across 4 heads.
        hidden_dim: width of the feed-forward hidden layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
        widen = nn.Linear(input_dim, hidden_dim)
        narrow = nn.Linear(hidden_dim, input_dim)
        self.ffn = nn.Sequential(widen, nn.ReLU(), narrow)
        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)

    def forward(self, x):
        """Return a tensor with the same shape as ``x`` (noted as (B, N, C) in the original)."""
        # Sub-layer 1: self-attention with query = key = value = x.
        attended, _ = self.attn(x, x, x)
        x = self.norm1(x + attended)
        # Sub-layer 2: position-wise feed-forward network.
        return self.norm2(x + self.ffn(x))
 class SandwichBlock(nn.Module):
    """Attention block, then graph convolution, then a second attention block.

    Args:
        num_nodes: number of graph nodes, forwarded to the graph constructor.
        hidden_dim: feature width kept constant through the block.
    """

    def __init__(self, num_nodes, hidden_dim):
        super().__init__()
        ffn_width = hidden_dim * 2
        self.manba1 = MANBA_Block(hidden_dim, ffn_width)
        self.graph_constructor = DynamicGraphConstructor(num_nodes)
        self.gc = GraphConvBlock(hidden_dim, hidden_dim)
        self.manba2 = MANBA_Block(hidden_dim, ffn_width)

    def forward(self, h):
        """Run ``h`` (noted as (B, N, C) in the original) through the three stages."""
        attended = self.manba1(h)
        # The adjacency is rebuilt from its learnable parameter on every call.
        adjacency = self.graph_constructor()
        mixed = self.gc(attended, adjacency)
        return self.manba2(mixed)
 class MLP(nn.Module):
    """Fully connected stack: Linear + activation per hidden layer, then a final Linear.

    Args:
        in_dim: size of the last input axis.
        hidden_dims: hidden-layer widths in order. Any iterable is accepted
            (list, tuple, ...); an empty one yields a single
            Linear(in_dim, out_dim).
        out_dim: size of the last output axis.
        activation: zero-argument callable that builds the activation module
            placed after each hidden Linear.
    """

    def __init__(self, in_dim, hidden_dims, out_dim, activation=nn.ReLU):
        super().__init__()
        # Unpacking instead of list concatenation: `[in_dim] + hidden_dims`
        # raises TypeError when hidden_dims is a tuple.
        dims = [in_dim, *hidden_dims, out_dim]
        layers = []
        # Every consecutive pair except the last one gets an activation.
        for fan_in, fan_out in zip(dims[:-2], dims[1:-1]):
            layers += [nn.Linear(fan_in, fan_out), activation()]
        layers.append(nn.Linear(dims[-2], dims[-1]))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stack to the last axis of ``x``."""
        return self.net(x)
 class EXP(nn.Module):
    """Forecaster: flow-history MLP plus time embeddings, then two SandwichBlocks.

    Args:
        args: mapping of hyper-parameters. Required keys: 'horizon',
            'output_dim', 'num_nodes'. Optional keys and their defaults:
            'in_len' (12), 'hidden_dim' (64), 'time_slots'
            (24 * 60 // args.get('time_slot', 5)).
    """

    def __init__(self, args):
        super().__init__()
        self.horizon = args['horizon']
        self.output_dim = args['output_dim']
        self.seq_len = args.get('in_len', 12)
        self.hidden_dim = args.get('hidden_dim', 64)
        self.num_nodes = args['num_nodes']

        # Discrete time embeddings: one table for the slot within a day and
        # one with 7 rows for the day of the week.
        slots_from_interval = 24 * 60 // args.get('time_slot', 5)
        self.time_slots = args.get('time_slots', slots_from_interval)
        self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
        self.day_embedding = nn.Embedding(7, self.hidden_dim)

        # Maps each node's flow history (length seq_len) to hidden_dim.
        self.input_proj = MLP(self.seq_len, [self.hidden_dim], self.hidden_dim)

        self.sandwich1 = SandwichBlock(self.num_nodes, self.hidden_dim)
        self.sandwich2 = SandwichBlock(self.num_nodes, self.hidden_dim)

        # Emits all horizon steps for every output channel at once.
        self.out_proj = MLP(
            self.hidden_dim,
            [2 * self.hidden_dim],
            self.horizon * self.output_dim,
        )

    def forward(self, x):
        """Predict ``horizon`` steps for every node.

        Args:
            x: tensor of shape (B, T, N, D_total) with D_total >= 3, where
                x[..., 0] is flow, x[..., 1] is time-in-day (0..1) and
                x[..., 2] is day-in-week (0..6).

        Returns:
            Tensor of shape (B, horizon, N, output_dim).
        """
        flow, time_in_day, day_in_week = x[..., 0], x[..., 1], x[..., 2]
        B, T, N = flow.shape
        assert T == self.seq_len

        # 1) One row per (batch, node) pair holding that node's T-step history.
        history = flow.permute(0, 2, 1).reshape(B * N, T)
        h0 = self.input_proj(history).view(B, N, self.hidden_dim)

        # 2) Only the last input step is used to look up the time embeddings.
        slot_idx = (time_in_day[:, -1, :,] * (self.time_slots - 1)).long()  # (B, N)
        day_idx = day_in_week[:, -1, :,].long()  # (B, N)

        # 3) Inject both time embeddings into the node representation.
        h0 = h0 + self.time_embedding(slot_idx) + self.day_embedding(day_idx)

        # 4) Only the first SandwichBlock is wrapped in a residual connection.
        h1 = self.sandwich1(h0) + h0
        h2 = self.sandwich2(h1)

        # 5) (B, N, horizon*output_dim) -> (B, horizon, N, output_dim)
        out = self.out_proj(h2).view(B, N, self.horizon, self.output_dim)
        return out.permute(0, 2, 1, 3)
--- a/model/EXP/EXP24.py
+++ b/model/EXP/EXP24.py
@ -0,0 +1,168 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 """
 添加时间嵌入 + 三重残差
 """
 class DynamicGraphConstructor(nn.Module):
    """Adjacency built from the product of two learnable node-embedding tables.

    Args:
        node_num: number of graph nodes.
        embed_dim: width of each node embedding.
    """

    def __init__(self, node_num, embed_dim):
        super().__init__()
        table_shape = (node_num, embed_dim)
        self.nodevec1 = nn.Parameter(torch.randn(*table_shape), requires_grad=True)
        self.nodevec2 = nn.Parameter(torch.randn(*table_shape), requires_grad=True)

    def forward(self):
        """Return the row-normalised adjacency, shape (node_num, node_num)."""
        # Pairwise affinity between every two nodes.
        affinity = self.nodevec1 @ self.nodevec2.T
        # ReLU clamps negatives to zero; the row-wise softmax then makes each
        # row sum to one (zeroed entries still receive exp(0) = 1 of the mass).
        return torch.relu(affinity).softmax(dim=-1)
 class GraphConvBlock(nn.Module):
    """One graph-convolution step with a skip connection and a ReLU.

    Args:
        input_dim: size of the last axis of the incoming features.
        output_dim: size of the last axis of the result.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.theta = nn.Linear(input_dim, output_dim)
        # Identity skip when the widths match, otherwise a linear projection.
        self.residual = input_dim == output_dim
        if not self.residual:
            self.res_proj = nn.Linear(input_dim, output_dim)

    def forward(self, x, adj):
        """Aggregate ``x`` with ``adj``, transform it, add the skip path, apply ReLU."""
        skip = x if self.residual else self.res_proj(x)
        transformed = self.theta(adj @ x)
        return F.relu(transformed + skip)
 class MANBA_Block(nn.Module):
    """Self-attention followed by a feed-forward net, each with add-and-LayerNorm.

    Args:
        input_dim: feature width of the input and output; it is also the
            attention embedding size, split across 4 heads.
        hidden_dim: width of the feed-forward hidden layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
        widen = nn.Linear(input_dim, hidden_dim)
        narrow = nn.Linear(hidden_dim, input_dim)
        self.ffn = nn.Sequential(widen, nn.ReLU(), narrow)
        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)

    def forward(self, x):
        """Return a tensor with the same shape as ``x``."""
        # Sub-layer 1: self-attention with query = key = value = x.
        attended, _ = self.attn(x, x, x)
        x = self.norm1(x + attended)
        # Sub-layer 2: position-wise feed-forward network.
        return self.norm2(x + self.ffn(x))
 class SandwichBlock(nn.Module):
    """Attention block, then graph convolution, then a second attention block.

    No residual is added here; the caller is expected to handle that.

    Args:
        num_nodes: number of graph nodes, forwarded to the graph constructor.
        embed_dim: node-embedding width, forwarded to the graph constructor.
        hidden_dim: feature width kept constant through the block.
    """

    def __init__(self, num_nodes, embed_dim, hidden_dim):
        super().__init__()
        ffn_width = hidden_dim * 2
        self.manba1 = MANBA_Block(hidden_dim, ffn_width)
        self.graph_constructor = DynamicGraphConstructor(num_nodes, embed_dim)
        self.gc = GraphConvBlock(hidden_dim, hidden_dim)
        self.manba2 = MANBA_Block(hidden_dim, ffn_width)

    def forward(self, h):
        """Run ``h`` through attention, graph convolution and attention again."""
        attended = self.manba1(h)
        # The adjacency is rebuilt from its learnable parameters on every call.
        adjacency = self.graph_constructor()
        mixed = self.gc(attended, adjacency)
        return self.manba2(mixed)
 class MLP(nn.Module):
    """Fully connected stack: Linear + activation per hidden layer, then a final Linear.

    Args:
        in_dim: size of the last input axis.
        hidden_dims: hidden-layer widths in order. Any iterable is accepted
            (list, tuple, ...); an empty one yields a single
            Linear(in_dim, out_dim).
        out_dim: size of the last output axis.
        activation: zero-argument callable that builds the activation module
            placed after each hidden Linear.
    """

    def __init__(self, in_dim, hidden_dims, out_dim, activation=nn.ReLU):
        super().__init__()
        # Unpacking instead of list concatenation: `[in_dim] + hidden_dims`
        # raises TypeError when hidden_dims is a tuple.
        dims = [in_dim, *hidden_dims, out_dim]
        layers = []
        # Every consecutive pair except the last one gets an activation.
        for fan_in, fan_out in zip(dims[:-2], dims[1:-1]):
            layers += [nn.Linear(fan_in, fan_out), activation()]
        layers.append(nn.Linear(dims[-2], dims[-1]))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stack to the last axis of ``x``."""
        return self.net(x)
 class EXP(nn.Module):
    """Forecaster with time embeddings and three residual connections.

    Args:
        args: mapping of hyper-parameters. Required keys: 'horizon',
            'output_dim', 'num_nodes'. Optional keys and their defaults:
            'in_len' (12), 'hidden_dim' (64), 'embed_dim' (16), 'time_slots'
            (24 * 60 // args.get('time_slot', 5)).
    """

    def __init__(self, args):
        super().__init__()
        self.horizon = args['horizon']
        self.output_dim = args['output_dim']
        self.seq_len = args.get('in_len', 12)
        self.hidden_dim = args.get('hidden_dim', 64)
        self.num_nodes = args['num_nodes']
        self.embed_dim = args.get('embed_dim', 16)

        # Discrete time embeddings: one table for the slot within a day and
        # one with 7 rows for the day of the week.
        slots_from_interval = 24 * 60 // args.get('time_slot', 5)
        self.time_slots = args.get('time_slots', slots_from_interval)
        self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
        self.day_embedding = nn.Embedding(7, self.hidden_dim)

        # Maps each node's flow history (length seq_len) to hidden_dim.
        self.input_proj = MLP(self.seq_len, [self.hidden_dim], self.hidden_dim)

        self.sandwich1 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)
        self.sandwich2 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)

        # Emits all horizon steps for every output channel at once.
        self.out_proj = MLP(
            self.hidden_dim,
            [2 * self.hidden_dim],
            self.horizon * self.output_dim,
        )

    def forward(self, x):
        """Predict ``horizon`` steps for every node.

        Args:
            x: tensor of shape (B, T, N, D_total) with D_total >= 3, where
                x[..., 0] is flow, x[..., 1] is time-in-day (0..1) and
                x[..., 2] is day-in-week (0..6).

        Returns:
            Tensor of shape (B, horizon, N, output_dim).
        """
        flow, time_in_day, day_in_week = x[..., 0], x[..., 1], x[..., 2]
        B, T, N = flow.shape
        assert T == self.seq_len

        # 1) One row per (batch, node) pair holding that node's T-step history.
        history = flow.permute(0, 2, 1).reshape(B * N, T)
        h0 = self.input_proj(history).view(B, N, self.hidden_dim)

        # 2) Only the last input step is used to look up the time embeddings.
        slot_idx = (time_in_day[:, -1, :,] * (self.time_slots - 1)).long()  # (B, N)
        day_idx = day_in_week[:, -1, :,].long()  # (B, N)

        # 3) Inject both time embeddings into the node representation.
        h0 = h0 + self.time_embedding(slot_idx) + self.day_embedding(day_idx)

        # ==== Triple residual ====
        # First: around SandwichBlock 1.
        h1 = self.sandwich1(h0) + h0
        # Second: around SandwichBlock 2.
        h2 = self.sandwich2(h1) + h1
        # Third: global shortcut straight back to the initial h0.
        h3 = h2 + h0

        # 5) (B, N, horizon*output_dim) -> (B, horizon, N, output_dim)
        out = self.out_proj(h3).view(B, N, self.horizon, self.output_dim)
        return out.permute(0, 2, 1, 3)
--- a/model/EXP/EXP25.py
+++ b/model/EXP/EXP25.py
@ -0,0 +1,196 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 class DynamicTanh(nn.Module):
    """Element-wise ``tanh(alpha * x)`` followed by a learnable affine map.

    Args:
        normalized_shape: shape of the ``weight`` and ``bias`` parameters.
        channels_last: when True the affine parameters are applied directly
            (broadcast against the trailing axes); when False they are
            reshaped with two trailing singleton axes, which assumes a
            channels-first layout such as (B, C, H, W).
        alpha_init_value: initial value of the scalar ``alpha``.
    """

    def __init__(self, normalized_shape, channels_last=True, alpha_init_value=0.5):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.alpha_init_value = alpha_init_value
        self.channels_last = channels_last

        # Learnable scalar that scales the input before tanh.
        self.alpha = nn.Parameter(torch.full((1,), alpha_init_value))
        # Affine parameters start as the identity map (scale 1, shift 0).
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))

    def forward(self, x):
        """Return ``tanh(alpha * x) * weight + bias`` with layout-aware broadcasting."""
        squashed = torch.tanh(self.alpha * x)
        if self.channels_last:
            scale, shift = self.weight, self.bias
        else:
            # Channels-first: line the parameters up with axis 1 of (B, C, H, W).
            scale = self.weight[:, None, None]
            shift = self.bias[:, None, None]
        return squashed * scale + shift

    def extra_repr(self):
        """Return the constructor settings shown when the module is printed."""
        return (
            f"normalized_shape={self.normalized_shape}, "
            f"alpha_init_value={self.alpha_init_value}, "
            f"channels_last={self.channels_last}"
        )
 class DynamicGraphConstructor(nn.Module):
    """Adjacency built from the product of two learnable node-embedding tables.

    Args:
        node_num: number of graph nodes.
        embed_dim: width of each node embedding.
    """

    def __init__(self, node_num, embed_dim):
        super().__init__()
        table_shape = (node_num, embed_dim)
        self.nodevec1 = nn.Parameter(torch.randn(*table_shape), requires_grad=True)
        self.nodevec2 = nn.Parameter(torch.randn(*table_shape), requires_grad=True)

    def forward(self):
        """Return the row-normalised adjacency, shape (node_num, node_num)."""
        # Pairwise affinity between every two nodes.
        affinity = self.nodevec1 @ self.nodevec2.T
        # ReLU clamps negatives to zero; the row-wise softmax then makes each
        # row sum to one (zeroed entries still receive exp(0) = 1 of the mass).
        return torch.relu(affinity).softmax(dim=-1)
 class GraphConvBlock(nn.Module):
    """One graph-convolution step with a skip connection and a ReLU.

    Args:
        input_dim: size of the last axis of the incoming features.
        output_dim: size of the last axis of the result.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.theta = nn.Linear(input_dim, output_dim)
        # Identity skip when the widths match, otherwise a linear projection.
        self.residual = input_dim == output_dim
        if not self.residual:
            self.res_proj = nn.Linear(input_dim, output_dim)

    def forward(self, x, adj):
        """Aggregate ``x`` with ``adj``, transform it, add the skip path, apply ReLU."""
        skip = x if self.residual else self.res_proj(x)
        transformed = self.theta(adj @ x)
        return F.relu(transformed + skip)
 class MANBA_Block(nn.Module):
    """Self-attention plus feed-forward block that uses DynamicTanh where LayerNorm would go.

    Args:
        input_dim: feature width of the input and output; it is also the
            attention embedding size, split across 4 heads.
        hidden_dim: width of the feed-forward hidden layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
        widen = nn.Linear(input_dim, hidden_dim)
        narrow = nn.Linear(hidden_dim, input_dim)
        self.ffn = nn.Sequential(widen, nn.ReLU(), narrow)
        # DynamicTanh stands in for LayerNorm after each residual sum.
        self.norm1 = DynamicTanh(normalized_shape=input_dim, channels_last=True)
        self.norm2 = DynamicTanh(normalized_shape=input_dim, channels_last=True)

    def forward(self, x):
        """Return a tensor with the same shape as ``x``."""
        # Sub-layer 1: self-attention with query = key = value = x.
        attended, _ = self.attn(x, x, x)
        x = self.norm1(x + attended)
        # Sub-layer 2: position-wise feed-forward network.
        return self.norm2(x + self.ffn(x))
 class SandwichBlock(nn.Module):
    """Attention block, then graph convolution, then a second attention block.

    Args:
        num_nodes: number of graph nodes, forwarded to the graph constructor.
        embed_dim: node-embedding width, forwarded to the graph constructor.
        hidden_dim: feature width kept constant through the block.
    """

    def __init__(self, num_nodes, embed_dim, hidden_dim):
        super().__init__()
        ffn_width = hidden_dim * 2
        self.manba1 = MANBA_Block(hidden_dim, ffn_width)
        self.graph_constructor = DynamicGraphConstructor(num_nodes, embed_dim)
        self.gc = GraphConvBlock(hidden_dim, hidden_dim)
        self.manba2 = MANBA_Block(hidden_dim, ffn_width)

    def forward(self, h):
        """Run ``h`` through attention, graph convolution and attention again."""
        attended = self.manba1(h)
        # The adjacency is rebuilt from its learnable parameters on every call.
        adjacency = self.graph_constructor()
        mixed = self.gc(attended, adjacency)
        return self.manba2(mixed)
 class MLP(nn.Module):
    """Fully connected stack: Linear + activation per hidden layer, then a final Linear.

    Args:
        in_dim: size of the last input axis.
        hidden_dims: hidden-layer widths in order. Any iterable is accepted
            (list, tuple, ...); an empty one yields a single
            Linear(in_dim, out_dim).
        out_dim: size of the last output axis.
        activation: zero-argument callable that builds the activation module
            placed after each hidden Linear.
    """

    def __init__(self, in_dim, hidden_dims, out_dim, activation=nn.ReLU):
        super().__init__()
        # Unpacking instead of list concatenation: `[in_dim] + hidden_dims`
        # raises TypeError when hidden_dims is a tuple.
        dims = [in_dim, *hidden_dims, out_dim]
        layers = []
        # Every consecutive pair except the last one gets an activation.
        for fan_in, fan_out in zip(dims[:-2], dims[1:-1]):
            layers += [nn.Linear(fan_in, fan_out), activation()]
        layers.append(nn.Linear(dims[-2], dims[-1]))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stack to the last axis of ``x``."""
        return self.net(x)
 class EXP(nn.Module):
    """Forecaster: flow-history MLP plus time embeddings, then two SandwichBlocks.

    Args:
        args: mapping of hyper-parameters. Required keys: 'horizon',
            'output_dim', 'num_nodes'. Optional keys and their defaults:
            'in_len' (12), 'hidden_dim' (64), 'embed_dim' (16), 'time_slots'
            (24 * 60 // args.get('time_slot', 5)).
    """

    def __init__(self, args):
        super().__init__()
        self.horizon = args['horizon']
        self.output_dim = args['output_dim']
        self.seq_len = args.get('in_len', 12)
        self.hidden_dim = args.get('hidden_dim', 64)
        self.num_nodes = args['num_nodes']
        self.embed_dim = args.get('embed_dim', 16)

        # Discrete time embeddings: one table for the slot within a day and
        # one with 7 rows for the day of the week.
        slots_from_interval = 24 * 60 // args.get('time_slot', 5)
        self.time_slots = args.get('time_slots', slots_from_interval)
        self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
        self.day_embedding = nn.Embedding(7, self.hidden_dim)

        # Maps each node's flow history (length seq_len) to hidden_dim.
        self.input_proj = MLP(self.seq_len, [self.hidden_dim], self.hidden_dim)

        self.sandwich1 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)
        self.sandwich2 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)

        # Emits all horizon steps for every output channel at once.
        self.out_proj = MLP(
            self.hidden_dim,
            [2 * self.hidden_dim],
            self.horizon * self.output_dim,
        )

    def forward(self, x):
        """Predict ``horizon`` steps for every node.

        Args:
            x: tensor of shape (B, T, N, D_total), where x[..., 0] is flow,
                x[..., 1] is time-in-day (scaled) and x[..., 2] is
                day-in-week.

        Returns:
            Tensor of shape (B, horizon, N, output_dim).
        """
        flow, time_in_day, day_in_week = x[..., 0], x[..., 1], x[..., 2]
        B, T, N = flow.shape
        assert T == self.seq_len, "Input sequence length mismatch"

        # One row per (batch, node) pair holding that node's T-step history.
        history = flow.permute(0, 2, 1).reshape(B * N, T)
        h0 = self.input_proj(history).view(B, N, self.hidden_dim)

        # Only the last input step is used to look up the time embeddings.
        slot_idx = (time_in_day[:, -1, :] * (self.time_slots - 1)).long()
        day_idx = day_in_week[:, -1, :].long()
        h0 = h0 + self.time_embedding(slot_idx) + self.day_embedding(day_idx)

        # Only the first SandwichBlock is wrapped in a residual connection.
        h1 = self.sandwich1(h0) + h0
        h2 = self.sandwich2(h1)

        # (B, N, horizon*output_dim) -> (B, horizon, N, output_dim)
        out = self.out_proj(h2).view(B, N, self.horizon, self.output_dim)
        return out.permute(0, 2, 1, 3)
 # Example usage:
 # args = {'horizon':12, 'output_dim':1, 'num_nodes':170}
 # model = EXP(args)
 # print(model)
--- a/model/EXP/EXP26.py
+++ b/model/EXP/EXP26.py
@ -0,0 +1,195 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 """
 添加时间嵌入 + 引入图注意力网络（GAT）
 """
 class DynamicGraphConstructor(nn.Module):
    """Adjacency built from the product of two learnable node-embedding tables.

    Args:
        node_num: number of graph nodes.
        embed_dim: width of each node embedding.
    """

    def __init__(self, node_num, embed_dim):
        super().__init__()
        table_shape = (node_num, embed_dim)
        self.nodevec1 = nn.Parameter(torch.randn(*table_shape), requires_grad=True)
        self.nodevec2 = nn.Parameter(torch.randn(*table_shape), requires_grad=True)

    def forward(self):
        """Return the row-normalised adjacency, shape (node_num, node_num)."""
        # Pairwise affinity between every two nodes.
        affinity = self.nodevec1 @ self.nodevec2.T
        # ReLU clamps negatives to zero; the row-wise softmax then makes each
        # row sum to one (zeroed entries still receive exp(0) = 1 of the mass).
        return torch.relu(affinity).softmax(dim=-1)
 # 原来的 GCN 块保留备用
 class GraphConvBlock(nn.Module):
    """One graph-convolution step with a skip connection and a ReLU.

    Args:
        input_dim: size of the last axis of the incoming features.
        output_dim: size of the last axis of the result.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.theta = nn.Linear(input_dim, output_dim)
        # Identity skip when the widths match, otherwise a linear projection.
        self.residual = input_dim == output_dim
        if not self.residual:
            self.res_proj = nn.Linear(input_dim, output_dim)

    def forward(self, x, adj):
        """Aggregate ``x`` with ``adj``, transform it, add the skip path, apply ReLU."""
        skip = x if self.residual else self.res_proj(x)
        transformed = self.theta(adj @ x)
        return F.relu(transformed + skip)
 # ★★ GAT 部分：从 LeronQ/GCN_predict-Pytorch 改写而来 ★★
 class GraphAttentionLayer(nn.Module):
    """Single-head dot-product graph attention over a dense adjacency.

    Args:
        in_c: input feature width.
        out_c: output feature width.
    """

    def __init__(self, in_c, out_c):
        super().__init__()
        self.W = nn.Linear(in_c, out_c, bias=False)
        # Create the bias already zeroed instead of allocating uninitialised
        # memory and clearing it afterwards.
        self.b = nn.Parameter(torch.zeros(out_c))
        nn.init.xavier_uniform_(self.W.weight)

    def forward(self, h, adj):
        """Attend over neighbours and return ReLU-activated features.

        Args:
            h: node features, shape [B, N, in_c].
            adj: edge weights, shape [N, N]; a zero entry means "no edge".

        Returns:
            Tensor of shape [B, N, out_c].
        """
        Wh = self.W(h)  # [B, N, out_c]
        edge_weight = adj.unsqueeze(0)  # [1, N, N], broadcast over the batch
        # Pairwise dot-product scores, weighted by the adjacency.
        score = torch.bmm(Wh, Wh.transpose(1, 2)) * edge_weight  # [B, N, N]
        # Mask on missing edges rather than on `score == 0`: a connected pair
        # whose dot product happens to be exactly zero must stay in the softmax.
        # finfo.min is representable in every float dtype, unlike -1e16 in
        # half precision.
        score = score.masked_fill(edge_weight == 0, torch.finfo(score.dtype).min)
        alpha = F.softmax(score, dim=-1)  # [B, N, N]
        # Weighted sum of neighbour features plus bias.
        out = torch.bmm(alpha, Wh) + self.b  # [B, N, out_c]
        return F.relu(out)
 class GraphAttentionBlock(nn.Module):
    """Several graph-attention heads whose outputs are concatenated and merged.

    Args:
        input_dim: input feature width.
        output_dim: width produced by each head and by the merged output.
        n_heads: number of parallel attention heads.
    """

    def __init__(self, input_dim, output_dim, n_heads=4):
        super().__init__()
        per_head = [GraphAttentionLayer(input_dim, output_dim) for _ in range(n_heads)]
        self.heads = nn.ModuleList(per_head)
        # One more attention layer folds the concatenated heads back to output_dim.
        self.out_att = GraphAttentionLayer(output_dim * n_heads, output_dim)
        self.act = nn.ReLU()

    def forward(self, x, adj):
        """Map ``x`` [B, N, C] and ``adj`` [N, N] to [B, N, output_dim]."""
        head_outputs = [head(x, adj) for head in self.heads]
        # [B, N, output_dim * n_heads] after joining along the feature axis.
        stacked = torch.cat(head_outputs, dim=-1)
        merged = self.out_att(stacked, adj)
        return self.act(merged)
 class MANBA_Block(nn.Module):
    """Self-attention followed by a feed-forward net, each with add-and-LayerNorm.

    Args:
        input_dim: feature width of the input and output; it is also the
            attention embedding size, split across 4 heads.
        hidden_dim: width of the feed-forward hidden layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
        widen = nn.Linear(input_dim, hidden_dim)
        narrow = nn.Linear(hidden_dim, input_dim)
        self.ffn = nn.Sequential(widen, nn.ReLU(), narrow)
        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)

    def forward(self, x):
        """Return a tensor with the same shape as ``x``."""
        # Sub-layer 1: self-attention with query = key = value = x.
        attended, _ = self.attn(x, x, x)
        x = self.norm1(x + attended)
        # Sub-layer 2: position-wise feed-forward network.
        return self.norm2(x + self.ffn(x))
 class SandwichBlock(nn.Module):
    """Attention block, then a graph-attention block, then a second attention block.

    Args:
        num_nodes: number of graph nodes, forwarded to the graph constructor.
        embed_dim: node-embedding width, forwarded to the graph constructor.
        hidden_dim: feature width kept constant through the block.
    """

    def __init__(self, num_nodes, embed_dim, hidden_dim):
        super().__init__()
        ffn_width = hidden_dim * 2
        self.manba1 = MANBA_Block(hidden_dim, ffn_width)
        self.graph_constructor = DynamicGraphConstructor(num_nodes, embed_dim)
        # The graph stage is the multi-head GAT block, not the plain GCN block.
        self.gc = GraphAttentionBlock(hidden_dim, hidden_dim, n_heads=4)
        self.manba2 = MANBA_Block(hidden_dim, ffn_width)

    def forward(self, h):
        """Run ``h`` through attention, graph attention and attention again."""
        attended = self.manba1(h)
        # The adjacency is rebuilt from its learnable parameters on every call.
        adjacency = self.graph_constructor()
        mixed = self.gc(attended, adjacency)
        return self.manba2(mixed)
 class MLP(nn.Module):
    """Fully connected stack: Linear + activation per hidden layer, then a final Linear.

    Args:
        in_dim: size of the last input axis.
        hidden_dims: hidden-layer widths in order. Any iterable is accepted
            (list, tuple, ...); an empty one yields a single
            Linear(in_dim, out_dim).
        out_dim: size of the last output axis.
        activation: zero-argument callable that builds the activation module
            placed after each hidden Linear.
    """

    def __init__(self, in_dim, hidden_dims, out_dim, activation=nn.ReLU):
        super().__init__()
        # Unpacking instead of list concatenation: `[in_dim] + hidden_dims`
        # raises TypeError when hidden_dims is a tuple.
        dims = [in_dim, *hidden_dims, out_dim]
        layers = []
        # Every consecutive pair except the last one gets an activation.
        for fan_in, fan_out in zip(dims[:-2], dims[1:-1]):
            layers += [nn.Linear(fan_in, fan_out), activation()]
        layers.append(nn.Linear(dims[-2], dims[-1]))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stack to the last axis of ``x``."""
        return self.net(x)
 class EXP(nn.Module):
    """Forecaster: flow-history MLP plus time embeddings, then two GAT-based SandwichBlocks.

    Args:
        args: mapping of hyper-parameters. Required keys: 'horizon',
            'output_dim', 'num_nodes'. Optional keys and their defaults:
            'in_len' (12), 'hidden_dim' (64), 'embed_dim' (16), 'time_slots'
            (24 * 60 // args.get('time_slot', 5)).
    """

    def __init__(self, args):
        super().__init__()
        self.horizon = args['horizon']
        self.output_dim = args['output_dim']
        self.seq_len = args.get('in_len', 12)
        self.hidden_dim = args.get('hidden_dim', 64)
        self.num_nodes = args['num_nodes']
        self.embed_dim = args.get('embed_dim', 16)

        # Discrete time embeddings: one table for the slot within a day and
        # one with 7 rows for the day of the week.
        slots_from_interval = 24 * 60 // args.get('time_slot', 5)
        self.time_slots = args.get('time_slots', slots_from_interval)
        self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
        self.day_embedding = nn.Embedding(7, self.hidden_dim)

        # Input projection sees the flow channel only.
        self.input_proj = MLP(self.seq_len, [self.hidden_dim], self.hidden_dim)

        self.sandwich1 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)
        self.sandwich2 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)

        # Emits all horizon steps for every output channel at once.
        self.out_proj = MLP(
            self.hidden_dim,
            [2 * self.hidden_dim],
            self.horizon * self.output_dim,
        )

    def forward(self, x):
        """Predict ``horizon`` steps for every node.

        Args:
            x: tensor of shape (B, T, N, D_total) with D_total >= 3, where
                x[..., 0] is flow, x[..., 1] is time-in-day and x[..., 2] is
                day-in-week.

        Returns:
            Tensor of shape (B, horizon, N, output_dim).
        """
        flow, time_in_day, day_in_week = x[..., 0], x[..., 1], x[..., 2]
        B, T, N = flow.shape
        assert T == self.seq_len

        # 1) One row per (batch, node) pair holding that node's T-step history.
        history = flow.permute(0, 2, 1).reshape(B * N, T)
        h0 = self.input_proj(history).view(B, N, self.hidden_dim)

        # 2) Only the last input step is used to look up the time embeddings.
        slot_idx = (time_in_day[:, -1, :,] * (self.time_slots - 1)).long()
        day_idx = day_in_week[:, -1, :,].long()

        # 3) Inject both time embeddings into the node representation.
        h0 = h0 + self.time_embedding(slot_idx) + self.day_embedding(day_idx)

        # 4) Only the first SandwichBlock is wrapped in a residual connection.
        h1 = self.sandwich1(h0) + h0
        h2 = self.sandwich2(h1)

        # 5) (B, N, horizon*output_dim) -> (B, horizon, N, output_dim)
        out = self.out_proj(h2).view(B, N, self.horizon, self.output_dim)
        return out.permute(0, 2, 1, 3)
--- a/model/EXP/EXP27.py
+++ b/model/EXP/EXP27.py
@ -0,0 +1,170 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 class MANBA_Block(nn.Module):
    """
    Post-norm self-attention block.

    A 4-head self-attention layer and a two-layer ReLU feed-forward network,
    each followed by a residual addition and a LayerNorm. Input and output
    both have `input_dim` features; `hidden_dim` is the feed-forward width.
    """
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
        feed_forward = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        ]
        self.ffn = nn.Sequential(*feed_forward)
        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
    def forward(self, x):
        # x: (B, N, input_dim); attention runs over the second axis.
        attended, _ = self.attn(x, x, x)
        x = self.norm1(x + attended)
        return self.norm2(x + self.ffn(x))
 class ExpertBlock(nn.Module):
    """
    Mixture-of-Experts block with top-1 routing.

    Each node representation is sent to exactly one of `num_experts` private
    feed-forward experts or to one extra "shared" expert, whichever receives
    the highest gate probability. The chosen expert's output is multiplied by
    that probability so the gating network receives a gradient; with a bare
    argmax the gate would never be trained.

    Args:
        hidden_dim: feature size of the input and of the output.
        num_experts: number of private experts (the shared expert is extra).
    """
    def __init__(self, hidden_dim, num_experts):
        super().__init__()
        self.num_experts = num_experts
        # gating network projects to num_experts + 1 (extra shared expert)
        self.gate = nn.Linear(hidden_dim, num_experts + 1)
        # per-expert FFNs
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim * 2),
                nn.ReLU(),
                nn.Linear(hidden_dim * 2, hidden_dim)
            ) for _ in range(num_experts)
        ])
        # shared expert
        self.shared_expert = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, hidden_dim)
        )
    def forward(self, x):
        """
        Route every node vector through its selected expert.

        x: (B, N, hidden_dim). Returns a tensor of the same shape.
        """
        B, N, D = x.shape
        # reshape (not view) so non-contiguous inputs are accepted as well
        flat = x.reshape(B * N, D)
        # gate probabilities over num_experts + 1 choices, and the winner per node
        scores = F.softmax(self.gate(flat), dim=-1)  # (B*N, num_experts+1)
        top_score, idx = scores.max(dim=-1)          # (B*N,), (B*N,)
        out_flat = torch.zeros_like(flat)
        # the shared expert is addressed by the last gate index
        branches = list(self.experts) + [self.shared_expert]
        for e, expert in enumerate(branches):
            mask = (idx == e)
            if mask.any():
                out_flat[mask] = expert(flat[mask])
        # scaling by the winning probability keeps the routing differentiable
        out_flat = out_flat * top_score.unsqueeze(-1)
        return out_flat.view(B, N, D)
 class MLP(nn.Module):
    """
    Plain feed-forward network.

    Builds Linear -> activation pairs for every hidden layer, followed by a
    final Linear layer without activation.

    Args:
        in_dim: size of the input features.
        hidden_dims: sequence (list, tuple, ...) of hidden layer sizes; may be empty.
        out_dim: size of the output features.
        activation: zero-argument factory for the activation module.
    """
    def __init__(self, in_dim, hidden_dims, out_dim, activation=nn.ReLU):
        super().__init__()
        # unpacking accepts any sequence, not only lists
        dims = [in_dim, *hidden_dims, out_dim]
        layers = []
        for d_in, d_out in zip(dims[:-2], dims[1:-1]):
            layers += [nn.Linear(d_in, d_out), activation()]
        layers.append(nn.Linear(dims[-2], dims[-1]))
        self.net = nn.Sequential(*layers)
    def forward(self, x):
        """Apply the network to the last dimension of x."""
        return self.net(x)
 class SandwichBlock(nn.Module):
    """
    MANBA -> Mixture-of-Experts -> MANBA stack.

    `num_nodes` and `embed_dim` are accepted but not used by this block.
    Both MANBA blocks use a feed-forward width of 2 * hidden_dim.
    """
    def __init__(self, num_nodes, embed_dim, hidden_dim, num_experts):
        super().__init__()
        ffn_width = hidden_dim * 2
        self.manba1 = MANBA_Block(hidden_dim, ffn_width)
        self.expert_block = ExpertBlock(hidden_dim, num_experts)
        self.manba2 = MANBA_Block(hidden_dim, ffn_width)
    def forward(self, h):
        # Apply the three stages in sequence; no residual is added here.
        return self.manba2(self.expert_block(self.manba1(h)))
 class EXP(nn.Module):
    """
    Flow forecaster built from an input MLP, time/day embeddings, two
    residual MoE sandwich blocks and an output MLP.

    Keys read from `args`: 'horizon', 'output_dim', 'num_nodes' (required);
    'in_len' (default 12), 'hidden_dim' (64), 'embed_dim' (16),
    'num_experts' (8), 'time_slots' and 'time_slot' (default 5).
    """
    def __init__(self, args):
        super().__init__()
        self.horizon = args['horizon']
        self.output_dim = args['output_dim']
        self.seq_len = args.get('in_len', 12)
        self.hidden_dim = args.get('hidden_dim', 64)
        self.num_nodes = args['num_nodes']
        self.embed_dim = args.get('embed_dim', 16)
        # number of private experts per MoE block
        self.num_experts = args.get('num_experts', 8)
        # discrete time embeddings; 'time_slot' is divided into 24 * 60 to get the default slot count
        default_slots = 24 * 60 // args.get('time_slot', 5)
        self.time_slots = args.get('time_slots', default_slots)
        self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
        self.day_embedding = nn.Embedding(7, self.hidden_dim)
        # input projection: flow history of length seq_len -> hidden_dim
        self.input_proj = MLP(
            in_dim=self.seq_len,
            hidden_dims=[self.hidden_dim],
            out_dim=self.hidden_dim,
        )
        # two Sandwich blocks with MoE, configured identically
        sandwich_args = (self.num_nodes, self.embed_dim, self.hidden_dim, self.num_experts)
        self.sandwich1 = SandwichBlock(*sandwich_args)
        self.sandwich2 = SandwichBlock(*sandwich_args)
        # output projection: hidden_dim -> horizon * output_dim
        self.out_proj = MLP(
            in_dim=self.hidden_dim,
            hidden_dims=[2 * self.hidden_dim],
            out_dim=self.horizon * self.output_dim,
        )
    def forward(self, x):
        """
        x: (B, T, N, D_total)
           x[...,0]= flow, x[...,1]=time_in_day, x[...,2]=day_in_week
        Returns a tensor of shape (B, horizon, N, output_dim).
        """
        flow, tod, dow = x[..., 0], x[..., 1], x[..., 2]
        batch, steps, nodes = flow.shape
        assert steps == self.seq_len
        # project each node's flow history
        history = flow.permute(0, 2, 1).reshape(batch * nodes, steps)
        base = self.input_proj(history).view(batch, nodes, self.hidden_dim)
        # time & day embeddings looked up at the last input step
        slot_idx = (tod[:, -1, :] * (self.time_slots - 1)).long()
        day_idx = dow[:, -1, :].long()
        base = base + self.time_embedding(slot_idx) + self.day_embedding(day_idx)
        # two MoE Sandwich blocks, each with a residual connection
        stage1 = self.sandwich1(base) + base
        stage2 = self.sandwich2(stage1) + stage1
        # output: (B, N, horizon * output_dim) -> (B, horizon, N, output_dim)
        pred = self.out_proj(stage2).view(batch, nodes, self.horizon, self.output_dim)
        return pred.permute(0, 2, 1, 3)
--- a/model/EXP/EXP8b.py
+++ b/model/EXP/EXP8b.py
@ -0,0 +1,133 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 """
 含残差版本
 """
 class DynamicGraphConstructor(nn.Module):
    """
    Learn a dense adjacency matrix from two trainable node-embedding tables.

    Args:
        node_num: number of graph nodes N.
        embed_dim: width D of each node embedding.
    """
    def __init__(self, node_num, embed_dim):
        super().__init__()
        shape = (node_num, embed_dim)
        self.nodevec1 = nn.Parameter(torch.randn(*shape), requires_grad=True)
        self.nodevec2 = nn.Parameter(torch.randn(*shape), requires_grad=True)
    def forward(self):
        # (N, D) @ (D, N) -> (N, N); ReLU zeroes negative affinities and the
        # softmax normalizes each row to sum to one.
        affinity = F.relu(self.nodevec1 @ self.nodevec2.T)
        return F.softmax(affinity, dim=-1)
 class GraphConvBlock(nn.Module):
    """
    One graph-convolution layer with a skip connection.

    Neighbour features are aggregated with `adj`, passed through a linear
    layer, added to the (optionally projected) input and sent through ReLU.
    """
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.theta = nn.Linear(input_dim, output_dim)
        # An identity skip is only possible when the feature sizes match;
        # otherwise the input is projected to output_dim first.
        self.residual = input_dim == output_dim
        if not self.residual:
            self.res_proj = nn.Linear(input_dim, output_dim)
    def forward(self, x, adj):
        # x: (B, N, C) / adj: (N, N)
        aggregated = self.theta(torch.matmul(adj, x))
        # residual connection
        skip = x if self.residual else self.res_proj(x)
        return F.relu(aggregated + skip)
 class MANBA_Block(nn.Module):
    """
    Self-attention block in post-norm layout.

    Applies 4-head self-attention and then a Linear-ReLU-Linear feed-forward
    network; each sub-layer output is added to its input and layer-normalized.
    """
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
        expand = nn.Linear(input_dim, hidden_dim)
        contract = nn.Linear(hidden_dim, input_dim)
        self.ffn = nn.Sequential(expand, nn.ReLU(), contract)
        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
    def forward(self, x):
        # x: (B, T, C); attention is computed along the second axis.
        context, _ = self.attn(x, x, x)
        normed = self.norm1(x + context)
        return self.norm2(normed + self.ffn(normed))
 class EXP(nn.Module):
    """
    Flow forecaster: linear projection of each node's history plus time/day
    embeddings, one graph convolution over a learned adjacency, one MANBA
    attention block and a linear output head.

    Keys read from `args`: 'horizon', 'output_dim', 'num_nodes' (required);
    'in_len' (default 12), 'hidden_dim' (64), 'time_slots' and
    'time_slot' (default 5).
    """
    def __init__(self, args):
        super().__init__()
        self.horizon, self.output_dim = args['horizon'], args['output_dim']
        self.seq_len = args.get('in_len', 12)
        self.hidden_dim = args.get('hidden_dim', 64)
        self.num_nodes = args['num_nodes']
        # 'time_slot' is divided into 24 * 60 to obtain the default slot count
        default_slots = 24 * 60 // args.get('time_slot', 5)
        self.time_slots = args.get('time_slots', default_slots)
        self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
        self.day_embedding = nn.Embedding(7, self.hidden_dim)
        # learned (dynamic) graph; the node-embedding width is fixed at 16
        self.graph = DynamicGraphConstructor(self.num_nodes, embed_dim=16)
        # input mapping layer: seq_len -> hidden_dim
        self.input_proj = nn.Linear(self.seq_len, self.hidden_dim)
        # graph convolution
        self.gc = GraphConvBlock(self.hidden_dim, self.hidden_dim)
        # MANBA block with feed-forward width 2 * hidden_dim
        self.manba = MANBA_Block(self.hidden_dim, self.hidden_dim * 2)
        # output mapping: hidden_dim -> horizon * output_dim
        self.out_proj = nn.Linear(self.hidden_dim, self.horizon * self.output_dim)
    def forward(self, x):
        # x: (B, T, N, D_total); channel 0 is the main signal, channel 1 the
        # time-in-day value and channel 2 the day-in-week value.
        tod = x[..., 1]   # (B, T, N)
        dow = x[..., 2]   # (B, T, N)
        flow = x[..., 0]  # (B, T, N), only the main channel is projected
        batch, steps, nodes = flow.shape
        assert steps == self.seq_len
        # input projection (B, T, N) -> (B, N, T) -> (B*N, T) -> (B, N, H)
        history = flow.permute(0, 2, 1).reshape(batch * nodes, steps)
        state = self.input_proj(history).view(batch, nodes, self.hidden_dim)
        # embedding indices taken from the last input step, each (B, N)
        slot_idx = (tod[:, -1, :] * (self.time_slots - 1)).long()
        day_idx = dow[:, -1, :].long()
        # inject the (B, N, hidden_dim) embeddings into the initial hidden state
        state = state + self.time_embedding(slot_idx) + self.day_embedding(day_idx)
        # spatial modelling: graph convolution over the learned (N, N) adjacency
        state = self.gc(state, self.graph())
        # MANBA attention block, (B, N, hidden_dim)
        state = self.manba(state)
        # output mapping: (B, N, horizon * output_dim) -> (B, horizon, N, output_dim)
        pred = self.out_proj(state).view(batch, nodes, self.horizon, self.output_dim)
        return pred.permute(0, 2, 1, 3)
--- a/model/STID/MLP.py
+++ b/model/STID/MLP.py
@ -0,0 +1,29 @@
 import torch
 from torch import nn
 class MultiLayerPerceptron(nn.Module):
    """Two 1x1-convolution layers with ReLU, dropout (p=0.15) and a residual link."""
    def __init__(self, input_dim, hidden_dim) -> None:
        super().__init__()
        pointwise = dict(kernel_size=(1, 1), bias=True)
        self.fc1 = nn.Conv2d(in_channels=input_dim, out_channels=hidden_dim, **pointwise)
        self.fc2 = nn.Conv2d(in_channels=hidden_dim, out_channels=hidden_dim, **pointwise)
        self.act = nn.ReLU()
        self.drop = nn.Dropout(p=0.15)
    def forward(self, input_data: torch.Tensor) -> torch.Tensor:
        """Feed forward of MLP.
        Args:
            input_data (torch.Tensor): channels-first input for Conv2d.
                NOTE(review): presumably [B, D, N, 1] as produced by the
                STID caller -- confirm. The residual addition needs
                input_dim == hidden_dim.
        Returns:
            torch.Tensor: latent repr with the same shape as the input
        """
        projected = self.act(self.fc1(input_data))
        projected = self.fc2(self.drop(projected))   # MLP
        return projected + input_data                # residual
--- a/model/STID/STID.py
+++ b/model/STID/STID.py
@ -0,0 +1,117 @@
 import torch
 from torch import nn
 from model.STID.MLP import MultiLayerPerceptron
 class STID(nn.Module):
    """
    Paper: Spatial-Temporal Identity: A Simple yet Effective Baseline for Multivariate Time Series Forecasting
    Link: https://arxiv.org/abs/2208.05233
    Official Code: https://github.com/zezhishao/STID

    The model embeds each node's flattened history with a 1x1 convolution,
    concatenates optional node / time-in-day / day-in-week identity
    embeddings along the channel axis, runs the result through a stack of
    residual MLP layers and regresses `output_len` steps.

    Required keys of `model_args`: num_nodes, node_dim, input_len, input_dim,
    embed_dim, output_len, num_layer, temp_dim_tid, temp_dim_diw,
    time_of_day_size, day_of_week_size, if_T_i_D, if_D_i_W, if_node.
    """
    def __init__(self, model_args):
        super().__init__()
        # attributes
        self.num_nodes = model_args["num_nodes"]
        self.node_dim = model_args["node_dim"]
        self.input_len = model_args["input_len"]
        self.input_dim = model_args["input_dim"]
        self.embed_dim = model_args["embed_dim"]
        self.output_len = model_args["output_len"]
        self.num_layer = model_args["num_layer"]
        self.temp_dim_tid = model_args["temp_dim_tid"]
        self.temp_dim_diw = model_args["temp_dim_diw"]
        self.time_of_day_size = model_args["time_of_day_size"]
        self.day_of_week_size = model_args["day_of_week_size"]
        self.if_time_in_day = model_args["if_T_i_D"]
        self.if_day_in_week = model_args["if_D_i_W"]
        self.if_spatial = model_args["if_node"]
        # spatial embeddings
        if self.if_spatial:
            self.node_emb = nn.Parameter(torch.empty(self.num_nodes, self.node_dim))
            nn.init.xavier_uniform_(self.node_emb)
        # temporal embeddings
        if self.if_time_in_day:
            self.time_in_day_emb = nn.Parameter(
                torch.empty(self.time_of_day_size, self.temp_dim_tid))
            nn.init.xavier_uniform_(self.time_in_day_emb)
        if self.if_day_in_week:
            self.day_in_week_emb = nn.Parameter(
                torch.empty(self.day_of_week_size, self.temp_dim_diw))
            nn.init.xavier_uniform_(self.day_in_week_emb)
        # embedding layer
        self.time_series_emb_layer = nn.Conv2d(
            in_channels=self.input_dim * self.input_len, out_channels=self.embed_dim, kernel_size=(1, 1), bias=True)
        # encoding: the width must equal the number of channels concatenated in
        # forward(), so each temporal dim is counted under its own flag
        # (time-in-day dim with if_T_i_D, day-in-week dim with if_D_i_W).
        self.hidden_dim = (
            self.embed_dim
            + self.node_dim * int(self.if_spatial)
            + self.temp_dim_tid * int(self.if_time_in_day)
            + self.temp_dim_diw * int(self.if_day_in_week)
        )
        self.encoder = nn.Sequential(
            *[MultiLayerPerceptron(self.hidden_dim, self.hidden_dim) for _ in range(self.num_layer)])
        # regression
        self.regression_layer = nn.Conv2d(
            in_channels=self.hidden_dim, out_channels=self.output_len, kernel_size=(1, 1), bias=True)
    def forward(self, history_data: torch.Tensor) -> torch.Tensor:
        """Feed forward of STID.
        Args:
            history_data (torch.Tensor): history data with shape [B, L, N, C];
                channel 1 is read as time-in-day and channel 2 as day-in-week
                when the corresponding flags are enabled.
        Returns:
            torch.Tensor: prediction with shape [B, output_len, N, 1]
        """
        # prepare data: keep the first input_dim channels
        input_data = history_data[..., :self.input_dim]
        if self.if_time_in_day:
            t_i_d_data = history_data[..., 1]
            # In the datasets used in STID, the time_of_day feature is normalized to [0, 1). We multiply it by
            # time_of_day_size to get the index. If you use other datasets, you may need to change this line.
            # .long() keeps the index on the input's device (LongTensor would force it onto the CPU).
            tid_index = (t_i_d_data[:, -1, :] * self.time_of_day_size).long()
            time_in_day_emb = self.time_in_day_emb[tid_index]
        else:
            time_in_day_emb = None
        if self.if_day_in_week:
            d_i_w_data = history_data[..., 2]
            # NOTE(review): this scaling presumes day_in_week is normalized to [0, 1) -- confirm with the loader.
            diw_index = (d_i_w_data[:, -1, :] * self.day_of_week_size).long()
            day_in_week_emb = self.day_in_week_emb[diw_index]
        else:
            day_in_week_emb = None
        # time series embedding: [B, L, N, C] -> [B, L*C, N, 1] -> [B, embed_dim, N, 1]
        batch_size, _, num_nodes, _ = input_data.shape
        input_data = input_data.transpose(1, 2).contiguous()
        input_data = input_data.view(
            batch_size, num_nodes, -1).transpose(1, 2).unsqueeze(-1)
        time_series_emb = self.time_series_emb_layer(input_data)
        node_emb = []
        if self.if_spatial:
            # expand node embeddings to [B, node_dim, N, 1]
            node_emb.append(self.node_emb.unsqueeze(0).expand(
                batch_size, -1, -1).transpose(1, 2).unsqueeze(-1))
        # temporal embeddings, each reshaped to [B, dim, N, 1]
        tem_emb = []
        if time_in_day_emb is not None:
            tem_emb.append(time_in_day_emb.transpose(1, 2).unsqueeze(-1))
        if day_in_week_emb is not None:
            tem_emb.append(day_in_week_emb.transpose(1, 2).unsqueeze(-1))
        # concatenate all embeddings along the channel axis
        hidden = torch.cat([time_series_emb] + node_emb + tem_emb, dim=1)
        # encoding
        hidden = self.encoder(hidden)
        # regression
        prediction = self.regression_layer(hidden)
        return prediction
--- a/model/model_selector.py
+++ b/model/model_selector.py
@ -13,7 +13,8 @@ from model.STFGNN.STFGNN import STFGNN
 from model.STSGCN.STSGCN import STSGCN
 from model.STGODE.STGODE import ODEGCN
 from model.PDG2SEQ.PDG2Seq import PDG2Seq
-from model.EXP.EXP21 import EXP as EXP
+from model.STID.STID import STID
 from model.EXP.EXP26 import EXP as EXP
 def model_selector(model):
    match model['type']:
@ -32,5 +33,6 @@ def model_selector(model):
        case 'STSGCN': return STSGCN(model)
        case 'STGODE': return ODEGCN(model)
        case 'PDG2SEQ': return PDG2Seq(model)
        case 'STID': return STID(model)
        case 'EXP': return EXP(model)