e21-e26: no improvement

czzhangheng 2025-04-21 20:31:09 +08:00
parent 0b006087ea
commit e851eb21d6
17 changed files with 2936 additions and 53 deletions

File diff suppressed because it is too large

@@ -27,7 +27,7 @@ train:
epochs: 300
lr_init: 0.003
weight_decay: 0
lr_decay: True
lr_decay: False
lr_decay_rate: 0.5
lr_decay_step: "5,20,40,65"
early_stop: True

@@ -14,18 +14,10 @@ data:
days_per_week: 7
model:
batch_size: 64
input_dim: 1
output_dim: 1
embed_dim: 12
rnn_units: 64
num_layers: 1
cheb_order: 2
use_day: True
use_week: True
graph_size: 30
expert_nums: 8
top_k: 2
hidden_dim: 64
in_len: 12
train:
loss_func: mae

config/STID/PEMSD4.yaml (new file)

@@ -0,0 +1,58 @@
data:
num_nodes: 307
lag: 12
horizon: 12
val_ratio: 0.2
test_ratio: 0.2
tod: False
normalizer: std
column_wise: False
default_graph: True
add_time_in_day: True
add_day_in_week: True
steps_per_day: 288
days_per_week: 7
model:
input_dim: 3
output_dim: 1
history: 12
horizon: 12
num_nodes: 307
input_len: 12
embed_dim": 32
output_len: 12
num_layer: 3
if_node: True
node_dim: 32
if_T_i_D: True
if_D_i_W: True
temp_dim_tid: 32
temp_dim_diw: 32
time_of_day_size: 288
day_of_week_size: 7
train:
loss_func: mae
seed: 1
batch_size: 64
epochs: 300
lr_init: 0.002
weight_decay: 0.0001
lr_decay: False
lr_decay_rate: 0.3
lr_decay_step: "1,50,80"
early_stop: True
early_stop_patience: 15
grad_norm: False
max_grad_norm: 5
real_value: True
test:
mae_thresh: null
mape_thresh: 0.0
log:
log_step: 200
plot: False
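
For reference, a minimal sketch of consuming this new config (assumes PyYAML is installed; the repo's training scripts may wire the sections differently):

import yaml
from model.STID.STID import STID

with open("config/STID/PEMSD4.yaml") as f:
    cfg = yaml.safe_load(f)

# the model section carries every key STID.__init__ reads
model = STID(cfg["model"])
print(sum(p.numel() for p in model.parameters()), "parameters")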

@@ -12,6 +12,8 @@ def init_model(args, device):
nn.init.xavier_uniform_(p)
else:
nn.init.uniform_(p)
total_params = sum(p.numel() for p in model.parameters())
print(f"Model has {total_params} parameters")
return model
def init_optimizer(model, args):

@@ -21,7 +21,7 @@ class PositionalEncoding(nn.Module):
return x + self.pe[:T].unsqueeze(1) # (T,1,d_model) broadcasts to (T,B,d_model)
class TemporalTransformerForecast(nn.Module):
class EXP(nn.Module):
"""
Transformer-based multi-step forecasting
- uses only x[...,0] as the input channel

@@ -4,7 +4,7 @@ import torch.nn.functional as F
"""
Replace the input/output proj layers with MLPs
Add time embedding
"""
class DynamicGraphConstructor(nn.Module):
@@ -104,6 +104,7 @@ class EXP(nn.Module):
self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
self.day_embedding = nn.Embedding(7, self.hidden_dim)
# input projection now still only takes the flow history
self.input_proj = MLP(
in_dim = self.seq_len,

@@ -2,11 +2,10 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
"""
Add spatial embedding
"""
"""
Replace the input/output proj layers with MLPs
and add an explicit spatial embedding (Spatial Embedding) to the EXP model
"""
class DynamicGraphConstructor(nn.Module):
def __init__(self, node_num, embed_dim):
@@ -114,6 +113,7 @@ class EXP(nn.Module):
self.time_slots = args.get('time_slots', 24 * 60 // args.get('time_slot', 5))
self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
self.day_embedding = nn.Embedding(7, self.hidden_dim)
self.node_emb = nn.Parameter(torch.empty(self.num_nodes, self.embed_dim))
# ==== spatial embedding ====
# one learnable vector per node
@@ -167,9 +167,11 @@ class EXP(nn.Module):
day_emb = self.day_embedding(d_idx) # (B, N, hidden_dim)
# 3) compute the spatial embedding and expand it to the batch size
node_idx = torch.arange(N, device=x.device) # (N,)
spatial_emb = self.spatial_embedding[node_idx] # (N, hidden_dim)
spatial_emb = spatial_emb.unsqueeze(0).expand(B, -1, -1) # (B, N, hidden_dim)
# node_emb = []
# node_emb.append(self.node_emb.unsqueeze(0).expand(
# B, -1, -1).transpose(1, 2).unsqueeze(-1))
# spatial_emb = torch.stack(node_emb)
spatial_emb = self.spatial_embedding.unsqueeze(0).expand(B, N, self.hidden_dim) # -> (B, N, hidden_dim)
# 4) add the three embeddings to h0
h0 = h0 + time_emb + day_emb + spatial_emb

model/EXP/EXP23.py (new file)

@@ -0,0 +1,159 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
"""
Add time embedding + graph construction from a learnable adjacency matrix
"""
class DynamicGraphConstructor(nn.Module):
def __init__(self, node_num):
super().__init__()
# represent the adjacency directly with an N×N learnable parameter matrix
self.adj_param = nn.Parameter(torch.randn(node_num, node_num), requires_grad=True)
def forward(self):
# non-linear truncation to drop negative edges
adj = F.relu(self.adj_param)
# row-wise normalization
adj = F.softmax(adj, dim=-1)
return adj
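# Note: this full adjacency has node_num * node_num learnable entries
# (e.g. 307 * 307 ≈ 94k for PEMSD4), versus about 2 * node_num * embed_dim
# (2 * 307 * 16 ≈ 9.8k) for the factorized nodevec variant used in the other EXP files.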
class GraphConvBlock(nn.Module):
def __init__(self, input_dim, output_dim):
super().__init__()
self.theta = nn.Linear(input_dim, output_dim)
self.residual = (input_dim == output_dim)
if not self.residual:
self.res_proj = nn.Linear(input_dim, output_dim)
def forward(self, x, adj):
# x: (B, N, C)
res = x
# multiply adjacency by node features
x = torch.matmul(adj, x)
x = self.theta(x)
x = x + (res if self.residual else self.res_proj(res))
return F.relu(x)
class MANBA_Block(nn.Module):
def __init__(self, input_dim, hidden_dim):
super().__init__()
self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
self.ffn = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, input_dim)
)
self.norm1 = nn.LayerNorm(input_dim)
self.norm2 = nn.LayerNorm(input_dim)
def forward(self, x):
# x: (B, N, C)
res = x
x_attn, _ = self.attn(x, x, x)
x = self.norm1(res + x_attn)
res2 = x
x_ffn = self.ffn(x)
x = self.norm2(res2 + x_ffn)
return x
class SandwichBlock(nn.Module):
def __init__(self, num_nodes, hidden_dim):
super().__init__()
self.manba1 = MANBA_Block(hidden_dim, hidden_dim * 2)
self.graph_constructor = DynamicGraphConstructor(num_nodes)
self.gc = GraphConvBlock(hidden_dim, hidden_dim)
self.manba2 = MANBA_Block(hidden_dim, hidden_dim * 2)
def forward(self, h):
# h: (B, N, C)
h1 = self.manba1(h)
adj = self.graph_constructor() # (N, N)
h2 = self.gc(h1, adj)
h3 = self.manba2(h2)
return h3
class MLP(nn.Module):
def __init__(self, in_dim, hidden_dims, out_dim, activation=nn.ReLU):
super().__init__()
dims = [in_dim] + hidden_dims + [out_dim]
layers = []
for i in range(len(dims) - 2):
layers += [nn.Linear(dims[i], dims[i + 1]), activation()]
layers += [nn.Linear(dims[-2], dims[-1])]
self.net = nn.Sequential(*layers)
def forward(self, x):
return self.net(x)
class EXP(nn.Module):
def __init__(self, args):
super().__init__()
self.horizon = args['horizon']
self.output_dim = args['output_dim']
self.seq_len = args.get('in_len', 12)
self.hidden_dim = args.get('hidden_dim', 64)
self.num_nodes = args['num_nodes']
# ==== discrete time embeddings ====
self.time_slots = args.get('time_slots', 24 * 60 // args.get('time_slot', 5))
self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
self.day_embedding = nn.Embedding(7, self.hidden_dim)
# projection of the flow history
self.input_proj = MLP(
in_dim = self.seq_len,
hidden_dims = [self.hidden_dim],
out_dim = self.hidden_dim
)
# two SandwichBlocks
self.sandwich1 = SandwichBlock(self.num_nodes, self.hidden_dim)
self.sandwich2 = SandwichBlock(self.num_nodes, self.hidden_dim)
# output projection
self.out_proj = MLP(
in_dim = self.hidden_dim,
hidden_dims = [2 * self.hidden_dim],
out_dim = self.horizon * self.output_dim
)
def forward(self, x):
"""
x: (B, T, N, D_total)
D_total >= 3:
x[...,0] = flow,
x[...,1] = time_in_day (0-1),
x[...,2] = day_in_week (0-6)
"""
x_flow = x[..., 0] # (B, T, N)
x_time = x[..., 1] # (B, T, N)
x_day = x[..., 2] # (B, T, N)
B, T, N = x_flow.shape
assert T == self.seq_len
# 1) project the flow history
x_flat = x_flow.permute(0, 2, 1).reshape(B * N, T)
h0 = self.input_proj(x_flat).view(B, N, self.hidden_dim)
# 2) discrete time indices
t_idx = (x_time[:, -1, :,] * (self.time_slots - 1)).long() # (B, N)
d_idx = x_day[:, -1, :,].long() # (B, N)
time_emb = self.time_embedding(t_idx)
day_emb = self.day_embedding(d_idx)
# 3) inject the time embeddings
h0 = h0 + time_emb + day_emb
# 4) Sandwich blocks + residual
h1 = self.sandwich1(h0)
h1 = h1 + h0
h2 = self.sandwich2(h1)
# 5) output projection
out = self.out_proj(h2) # (B, N, horizon*output_dim)
out = out.view(B, N, self.horizon, self.output_dim)
out = out.permute(0, 2, 1, 3) # (B, horizon, N, output_dim)
return out
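
A minimal shape check for EXP23 (illustrative; the sizes below are assumptions, not the experiment settings):

import torch
args = {'horizon': 12, 'output_dim': 1, 'in_len': 12,
        'hidden_dim': 64, 'num_nodes': 170}
model = EXP(args)
x = torch.rand(2, 12, 170, 3)                           # (B, T, N, D): flow, time_in_day, day_in_week
x[..., 2] = torch.randint(0, 7, (2, 12, 170)).float()   # day index in 0..6
print(model(x).shape)                                   # torch.Size([2, 12, 170, 1])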

model/EXP/EXP24.py (new file)

@@ -0,0 +1,168 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
"""
Add time embedding + triple residual
"""
class DynamicGraphConstructor(nn.Module):
def __init__(self, node_num, embed_dim):
super().__init__()
self.nodevec1 = nn.Parameter(torch.randn(node_num, embed_dim), requires_grad=True)
self.nodevec2 = nn.Parameter(torch.randn(node_num, embed_dim), requires_grad=True)
def forward(self):
adj = torch.matmul(self.nodevec1, self.nodevec2.T)
adj = F.relu(adj)
adj = F.softmax(adj, dim=-1)
return adj
class GraphConvBlock(nn.Module):
def __init__(self, input_dim, output_dim):
super().__init__()
self.theta = nn.Linear(input_dim, output_dim)
self.residual = (input_dim == output_dim)
if not self.residual:
self.res_proj = nn.Linear(input_dim, output_dim)
def forward(self, x, adj):
res = x
x = torch.matmul(adj, x)
x = self.theta(x)
x = x + (res if self.residual else self.res_proj(res))
return F.relu(x)
class MANBA_Block(nn.Module):
def __init__(self, input_dim, hidden_dim):
super().__init__()
self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
self.ffn = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, input_dim)
)
self.norm1 = nn.LayerNorm(input_dim)
self.norm2 = nn.LayerNorm(input_dim)
def forward(self, x):
res = x
x_attn, _ = self.attn(x, x, x)
x = self.norm1(res + x_attn)
res2 = x
x_ffn = self.ffn(x)
x = self.norm2(res2 + x_ffn)
return x
class SandwichBlock(nn.Module):
def __init__(self, num_nodes, embed_dim, hidden_dim):
super().__init__()
self.manba1 = MANBA_Block(hidden_dim, hidden_dim * 2)
self.graph_constructor = DynamicGraphConstructor(num_nodes, embed_dim)
self.gc = GraphConvBlock(hidden_dim, hidden_dim)
self.manba2 = MANBA_Block(hidden_dim, hidden_dim * 2)
def forward(self, h):
h1 = self.manba1(h)
adj = self.graph_constructor()
h2 = self.gc(h1, adj)
h3 = self.manba2(h2)
return h3 # no residual here; it is applied uniformly in the parent EXP
class MLP(nn.Module):
def __init__(self, in_dim, hidden_dims, out_dim, activation=nn.ReLU):
super().__init__()
dims = [in_dim] + hidden_dims + [out_dim]
layers = []
for i in range(len(dims)-2):
layers += [nn.Linear(dims[i], dims[i+1]), activation()]
layers += [nn.Linear(dims[-2], dims[-1])]
self.net = nn.Sequential(*layers)
def forward(self, x):
return self.net(x)
class EXP(nn.Module):
def __init__(self, args):
super().__init__()
self.horizon = args['horizon']
self.output_dim = args['output_dim']
self.seq_len = args.get('in_len', 12)
self.hidden_dim = args.get('hidden_dim', 64)
self.num_nodes = args['num_nodes']
self.embed_dim = args.get('embed_dim', 16)
# ==== discrete time embeddings ====
self.time_slots = args.get('time_slots', 24 * 60 // args.get('time_slot', 5))
self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
self.day_embedding = nn.Embedding(7, self.hidden_dim)
# projection of the flow history
self.input_proj = MLP(
in_dim = self.seq_len,
hidden_dims = [self.hidden_dim],
out_dim = self.hidden_dim
)
# two SandwichBlocks
self.sandwich1 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)
self.sandwich2 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)
# output projection
self.out_proj = MLP(
in_dim = self.hidden_dim,
hidden_dims = [2 * self.hidden_dim],
out_dim = self.horizon * self.output_dim
)
def forward(self, x):
"""
x: (B, T, N, D_total)
D_total >= 3:
x[...,0] = flow,
x[...,1] = time_in_day (0-1),
x[...,2] = day_in_week (0-6)
"""
x_flow = x[..., 0] # (B, T, N)
x_time = x[..., 1] # (B, T, N)
x_day = x[..., 2] # (B, T, N)
B, T, N = x_flow.shape
assert T == self.seq_len
# 1) project the flow history
x_flat = x_flow.permute(0, 2, 1).reshape(B * N, T)
h0 = self.input_proj(x_flat).view(B, N, self.hidden_dim)
# 2) discrete time indices
t_idx = (x_time[:, -1, :,] * (self.time_slots - 1)).long() # (B, N)
d_idx = x_day[:, -1, :,].long() # (B, N)
time_emb = self.time_embedding(t_idx)
day_emb = self.day_embedding(d_idx)
# 3) inject the time embeddings
h0 = h0 + time_emb + day_emb
# ==== triple residual ====
# first: Sandwich1 + residual
h1 = self.sandwich1(h0)
h1 = h1 + h0
# second: Sandwich2 + residual
h2 = self.sandwich2(h1)
h2 = h2 + h1
# third: global residual (connected directly back to the initial h0)
h3 = h2 + h0
# 5) output projection
out = self.out_proj(h3) # (B, N, horizon*output_dim)
out = out.view(B, N, self.horizon, self.output_dim)
out = out.permute(0, 2, 1, 3) # (B, horizon, N, output_dim)
return out

model/EXP/EXP25.py (new file)

@@ -0,0 +1,196 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
class DynamicTanh(nn.Module):
"""
Dynamic tanh activation with learnable scaling (alpha) and affine transformation (weight, bias).
"""
def __init__(self, normalized_shape, channels_last=True, alpha_init_value=0.5):
super().__init__()
self.normalized_shape = normalized_shape
self.alpha_init_value = alpha_init_value
self.channels_last = channels_last
# learnable scale for tanh
self.alpha = nn.Parameter(torch.full((1,), alpha_init_value))
# affine parameters
self.weight = nn.Parameter(torch.ones(normalized_shape))
self.bias = nn.Parameter(torch.zeros(normalized_shape))
def forward(self, x):
# scaled tanh
x = torch.tanh(self.alpha * x)
# affine transform
if self.channels_last:
x = x * self.weight + self.bias
else:
# channels_first: assume shape (B, C, H, W)
x = x * self.weight[:, None, None] + self.bias[:, None, None]
return x
def extra_repr(self):
return f"normalized_shape={self.normalized_shape}, alpha_init_value={self.alpha_init_value}, channels_last={self.channels_last}"
class DynamicGraphConstructor(nn.Module):
def __init__(self, node_num, embed_dim):
super().__init__()
self.nodevec1 = nn.Parameter(torch.randn(node_num, embed_dim), requires_grad=True)
self.nodevec2 = nn.Parameter(torch.randn(node_num, embed_dim), requires_grad=True)
def forward(self):
adj = torch.matmul(self.nodevec1, self.nodevec2.T)
adj = F.relu(adj)
adj = F.softmax(adj, dim=-1)
return adj
class GraphConvBlock(nn.Module):
def __init__(self, input_dim, output_dim):
super().__init__()
self.theta = nn.Linear(input_dim, output_dim)
self.residual = (input_dim == output_dim)
if not self.residual:
self.res_proj = nn.Linear(input_dim, output_dim)
def forward(self, x, adj):
res = x
x = torch.matmul(adj, x)
x = self.theta(x)
x = x + (res if self.residual else self.res_proj(res))
return F.relu(x)
class MANBA_Block(nn.Module):
"""
Multi-head attention + feed-forward network with DynamicTanh replacing LayerNorm.
"""
def __init__(self, input_dim, hidden_dim):
super().__init__()
self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
self.ffn = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, input_dim)
)
# replace LayerNorm with DynamicTanh
self.norm1 = DynamicTanh(normalized_shape=input_dim, channels_last=True)
self.norm2 = DynamicTanh(normalized_shape=input_dim, channels_last=True)
def forward(self, x):
# self-attention
res = x
x_attn, _ = self.attn(x, x, x)
x = self.norm1(res + x_attn)
# feed-forward
res2 = x
x_ffn = self.ffn(x)
x = self.norm2(res2 + x_ffn)
return x
class SandwichBlock(nn.Module):
def __init__(self, num_nodes, embed_dim, hidden_dim):
super().__init__()
self.manba1 = MANBA_Block(hidden_dim, hidden_dim * 2)
self.graph_constructor = DynamicGraphConstructor(num_nodes, embed_dim)
self.gc = GraphConvBlock(hidden_dim, hidden_dim)
self.manba2 = MANBA_Block(hidden_dim, hidden_dim * 2)
def forward(self, h):
h1 = self.manba1(h)
adj = self.graph_constructor()
h2 = self.gc(h1, adj)
h3 = self.manba2(h2)
return h3
class MLP(nn.Module):
def __init__(self, in_dim, hidden_dims, out_dim, activation=nn.ReLU):
super().__init__()
dims = [in_dim] + hidden_dims + [out_dim]
layers = []
for i in range(len(dims) - 2):
layers.append(nn.Linear(dims[i], dims[i+1]))
layers.append(activation())
layers.append(nn.Linear(dims[-2], dims[-1]))
self.net = nn.Sequential(*layers)
def forward(self, x):
return self.net(x)
class EXP(nn.Module):
def __init__(self, args):
super().__init__()
self.horizon = args['horizon']
self.output_dim = args['output_dim']
self.seq_len = args.get('in_len', 12)
self.hidden_dim = args.get('hidden_dim', 64)
self.num_nodes = args['num_nodes']
self.embed_dim = args.get('embed_dim', 16)
# discrete time embeddings
self.time_slots = args.get('time_slots', 24 * 60 // args.get('time_slot', 5))
self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
self.day_embedding = nn.Embedding(7, self.hidden_dim)
# input projection for flow history
self.input_proj = MLP(
in_dim = self.seq_len,
hidden_dims = [self.hidden_dim],
out_dim = self.hidden_dim
)
# two Sandwich blocks
self.sandwich1 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)
self.sandwich2 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)
# output projection
self.out_proj = MLP(
in_dim = self.hidden_dim,
hidden_dims = [2 * self.hidden_dim],
out_dim = self.horizon * self.output_dim
)
def forward(self, x):
"""
x: (B, T, N, D_total) where
x[...,0]=flow, x[...,1]=time_in_day (scaled), x[...,2]=day_in_week
"""
x_flow = x[..., 0] # (B, T, N)
x_time = x[..., 1] # (B, T, N)
x_day = x[..., 2] # (B, T, N)
B, T, N = x_flow.shape
assert T == self.seq_len, "Input sequence length mismatch"
# project flow history
x_flat = x_flow.permute(0, 2, 1).reshape(B * N, T)
h0 = self.input_proj(x_flat).view(B, N, self.hidden_dim)
# time embeddings at last step
t_idx = (x_time[:, -1, :] * (self.time_slots - 1)).long()
d_idx = x_day[:, -1, :].long()
time_emb = self.time_embedding(t_idx)
day_emb = self.day_embedding(d_idx)
# inject time features
h0 = h0 + time_emb + day_emb
# Sandwich + residuals
h1 = self.sandwich1(h0) + h0
h2 = self.sandwich2(h1)
# output
out = self.out_proj(h2)
out = out.view(B, N, self.horizon, self.output_dim)
out = out.permute(0, 2, 1, 3)
return out
# Example usage:
# args = {'horizon':12, 'output_dim':1, 'num_nodes':170}
# model = EXP(args)
# print(model)
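
Worth noting: the MANBA blocks above swap nn.LayerNorm for DynamicTanh, which is shape-preserving, so the rest of the block is unchanged. A tiny check with assumed sizes:

import torch
dyt = DynamicTanh(normalized_shape=64, channels_last=True)
x = torch.randn(2, 30, 64)       # (B, N, hidden_dim)
print(dyt(x).shape)              # torch.Size([2, 30, 64]), same shape nn.LayerNorm(64) would return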

model/EXP/EXP26.py (new file)

@@ -0,0 +1,195 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
"""
Add time embedding + introduce a graph attention network (GAT)
"""
class DynamicGraphConstructor(nn.Module):
def __init__(self, node_num, embed_dim):
super().__init__()
self.nodevec1 = nn.Parameter(torch.randn(node_num, embed_dim), requires_grad=True)
self.nodevec2 = nn.Parameter(torch.randn(node_num, embed_dim), requires_grad=True)
def forward(self):
adj = torch.matmul(self.nodevec1, self.nodevec2.T)
adj = F.relu(adj)
adj = F.softmax(adj, dim=-1)
return adj
# the original GCN block is kept as a spare
class GraphConvBlock(nn.Module):
def __init__(self, input_dim, output_dim):
super().__init__()
self.theta = nn.Linear(input_dim, output_dim)
self.residual = (input_dim == output_dim)
if not self.residual:
self.res_proj = nn.Linear(input_dim, output_dim)
def forward(self, x, adj):
res = x
x = torch.matmul(adj, x)
x = self.theta(x)
x = x + (res if self.residual else self.res_proj(res))
return F.relu(x)
# ★★ GAT part: adapted from LeronQ/GCN_predict-Pytorch ★★
class GraphAttentionLayer(nn.Module):
def __init__(self, in_c, out_c):
super().__init__()
self.W = nn.Linear(in_c, out_c, bias=False)
self.b = nn.Parameter(torch.Tensor(out_c))
nn.init.xavier_uniform_(self.W.weight)
nn.init.zeros_(self.b)
def forward(self, h, adj):
# h: [B, N, C_in], adj: [N, N]
Wh = self.W(h) # [B, N, C_out]
# compute attention scores
score = torch.bmm(Wh, Wh.transpose(1, 2)) * adj.unsqueeze(0) # [B, N, N]
score = score.masked_fill(score == 0, -1e16)
alpha = F.softmax(score, dim=-1) # [B, N, N]
# weighted sum plus bias
out = torch.bmm(alpha, Wh) + self.b # [B, N, C_out]
return F.relu(out)
class GraphAttentionBlock(nn.Module):
def __init__(self, input_dim, output_dim, n_heads=4):
super().__init__()
# multi-head attention
self.heads = nn.ModuleList([GraphAttentionLayer(input_dim, output_dim) for _ in range(n_heads)])
# after merging the heads, apply one more mapping layer
self.out_att = GraphAttentionLayer(output_dim * n_heads, output_dim)
self.act = nn.ReLU()
def forward(self, x, adj):
# x: [B, N, C], adj: [N, N]
# run the heads in parallel, then concatenate
h_cat = torch.cat([head(x, adj) for head in self.heads], dim=-1) # [B, N, output_dim * n_heads]
h_out = self.out_att(h_cat, adj) # [B, N, output_dim]
return self.act(h_out)
class MANBA_Block(nn.Module):
def __init__(self, input_dim, hidden_dim):
super().__init__()
self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
self.ffn = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, input_dim)
)
self.norm1 = nn.LayerNorm(input_dim)
self.norm2 = nn.LayerNorm(input_dim)
def forward(self, x):
res = x
x_attn, _ = self.attn(x, x, x)
x = self.norm1(res + x_attn)
res2 = x
x_ffn = self.ffn(x)
x = self.norm2(res2 + x_ffn)
return x
class SandwichBlock(nn.Module):
def __init__(self, num_nodes, embed_dim, hidden_dim):
super().__init__()
self.manba1 = MANBA_Block(hidden_dim, hidden_dim * 2)
self.graph_constructor = DynamicGraphConstructor(num_nodes, embed_dim)
# ★★ replaced with the GAT block ★★
self.gc = GraphAttentionBlock(hidden_dim, hidden_dim, n_heads=4)
self.manba2 = MANBA_Block(hidden_dim, hidden_dim * 2)
def forward(self, h):
h1 = self.manba1(h)
adj = self.graph_constructor()
h2 = self.gc(h1, adj)
h3 = self.manba2(h2)
return h3
class MLP(nn.Module):
def __init__(self, in_dim, hidden_dims, out_dim, activation=nn.ReLU):
super().__init__()
dims = [in_dim] + hidden_dims + [out_dim]
layers = []
for i in range(len(dims)-2):
layers += [nn.Linear(dims[i], dims[i+1]), activation()]
layers += [nn.Linear(dims[-2], dims[-1])]
self.net = nn.Sequential(*layers)
def forward(self, x):
return self.net(x)
class EXP(nn.Module):
def __init__(self, args):
super().__init__()
self.horizon = args['horizon']
self.output_dim = args['output_dim']
self.seq_len = args.get('in_len', 12)
self.hidden_dim = args.get('hidden_dim', 64)
self.num_nodes = args['num_nodes']
self.embed_dim = args.get('embed_dim', 16)
# ==== new: discrete time embeddings ====
self.time_slots = args.get('time_slots', 24 * 60 // args.get('time_slot', 5))
self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
self.day_embedding = nn.Embedding(7, self.hidden_dim)
# input projection (flow only)
self.input_proj = MLP(
in_dim = self.seq_len,
hidden_dims = [self.hidden_dim],
out_dim = self.hidden_dim
)
# two SandwichBlocks (graph conv already replaced with GAT)
self.sandwich1 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)
self.sandwich2 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim)
# output projection
self.out_proj = MLP(
in_dim = self.hidden_dim,
hidden_dims = [2 * self.hidden_dim],
out_dim = self.horizon * self.output_dim
)
def forward(self, x):
"""
x: (B, T, N, D_total)
D_total >= 3, x[...,0]=flow, x[...,1]=time_in_day, x[...,2]=day_in_week
"""
x_flow = x[..., 0] # (B, T, N)
x_time = x[..., 1] # (B, T, N)
x_day = x[..., 2] # (B, T, N)
B, T, N = x_flow.shape
assert T == self.seq_len
# 1) project the flow history
x_flat = x_flow.permute(0, 2, 1).reshape(B * N, T)
h0 = self.input_proj(x_flat).view(B, N, self.hidden_dim)
# 2) take the time indices of the last step and embed them
t_idx = (x_time[:, -1, :,] * (self.time_slots - 1)).long()
d_idx = x_day[:, -1, :,].long()
time_emb = self.time_embedding(t_idx)
day_emb = self.day_embedding(d_idx)
# 3) inject the time information
h0 = h0 + time_emb + day_emb
# 4) Sandwich blocks + residual
h1 = self.sandwich1(h0)
h1 = h1 + h0
h2 = self.sandwich2(h1)
# 5) output
out = self.out_proj(h2)
out = out.view(B, N, self.horizon, self.output_dim).permute(0, 2, 1, 3)
return out
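
A quick shape check for the GAT-based spatial block (illustrative sizes):

import torch
adj = DynamicGraphConstructor(node_num=30, embed_dim=16)()   # (30, 30), rows sum to 1
gat = GraphAttentionBlock(input_dim=64, output_dim=64, n_heads=4)
x = torch.randn(2, 30, 64)                                   # (B, N, hidden_dim)
print(gat(x, adj).shape)                                     # torch.Size([2, 30, 64])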

model/EXP/EXP27.py (new file)

@@ -0,0 +1,170 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
class MANBA_Block(nn.Module):
def __init__(self, input_dim, hidden_dim):
super().__init__()
self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
self.ffn = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, input_dim)
)
self.norm1 = nn.LayerNorm(input_dim)
self.norm2 = nn.LayerNorm(input_dim)
def forward(self, x):
# x: (B, N, input_dim)
res = x
x_attn, _ = self.attn(x, x, x)
x = self.norm1(res + x_attn)
res2 = x
x_ffn = self.ffn(x)
x = self.norm2(res2 + x_ffn)
return x
class ExpertBlock(nn.Module):
"""
Mixture-of-Experts block: routes each node's representation to a selected expert or a shared expert.
"""
def __init__(self, hidden_dim, num_experts):
super().__init__()
self.num_experts = num_experts
# gating network projects to num_experts + 1 (extra shared expert)
self.gate = nn.Linear(hidden_dim, num_experts + 1)
# per-expert FFNs
self.experts = nn.ModuleList([
nn.Sequential(
nn.Linear(hidden_dim, hidden_dim * 2),
nn.ReLU(),
nn.Linear(hidden_dim * 2, hidden_dim)
) for _ in range(num_experts)
])
# shared expert
self.shared_expert = nn.Sequential(
nn.Linear(hidden_dim, hidden_dim * 2),
nn.ReLU(),
nn.Linear(hidden_dim * 2, hidden_dim)
)
def forward(self, x):
# x: (B, N, hidden_dim)
B, N, D = x.shape
# flatten to (B*N, D)
flat = x.view(B * N, D)
# compute gating scores and select expert per node
scores = F.softmax(self.gate(flat), dim=-1) # (B*N, num_experts+1)
idx = scores.argmax(dim=-1) # (B*N,)
out_flat = torch.zeros_like(flat)
# apply each expert
for e in range(self.num_experts):
mask = (idx == e)
if mask.any():
out_flat[mask] = self.experts[e](flat[mask])
# apply shared expert for last index
shared_mask = (idx == self.num_experts)
if shared_mask.any():
out_flat[shared_mask] = self.shared_expert(flat[shared_mask])
# reshape back to (B, N, D)
return out_flat.view(B, N, D)
class MLP(nn.Module):
def __init__(self, in_dim, hidden_dims, out_dim, activation=nn.ReLU):
super().__init__()
dims = [in_dim] + hidden_dims + [out_dim]
layers = []
for i in range(len(dims) - 2):
layers += [nn.Linear(dims[i], dims[i+1]), activation()]
layers += [nn.Linear(dims[-2], dims[-1])]
self.net = nn.Sequential(*layers)
def forward(self, x):
return self.net(x)
class SandwichBlock(nn.Module):
def __init__(self, num_nodes, embed_dim, hidden_dim, num_experts):
super().__init__()
self.manba1 = MANBA_Block(hidden_dim, hidden_dim * 2)
self.expert_block = ExpertBlock(hidden_dim, num_experts)
self.manba2 = MANBA_Block(hidden_dim, hidden_dim * 2)
def forward(self, h):
h1 = self.manba1(h)
h2 = self.expert_block(h1)
h3 = self.manba2(h2)
return h3
class EXP(nn.Module):
def __init__(self, args):
super().__init__()
self.horizon = args['horizon']
self.output_dim = args['output_dim']
self.seq_len = args.get('in_len', 12)
self.hidden_dim = args.get('hidden_dim', 64)
self.num_nodes = args['num_nodes']
self.embed_dim = args.get('embed_dim', 16)
self.num_experts = args.get('num_experts', 8) # number of private experts
# discrete time embeddings
self.time_slots = args.get('time_slots', 24 * 60 // args.get('time_slot', 5))
self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
self.day_embedding = nn.Embedding(7, self.hidden_dim)
# input projection
self.input_proj = MLP(
in_dim = self.seq_len,
hidden_dims = [self.hidden_dim],
out_dim = self.hidden_dim
)
# two Sandwich blocks with MoE
self.sandwich1 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim, self.num_experts)
self.sandwich2 = SandwichBlock(self.num_nodes, self.embed_dim, self.hidden_dim, self.num_experts)
# output projection
self.out_proj = MLP(
in_dim = self.hidden_dim,
hidden_dims = [2 * self.hidden_dim],
out_dim = self.horizon * self.output_dim
)
def forward(self, x):
"""
x: (B, T, N, D_total)
x[...,0]= flow, x[...,1]=time_in_day, x[...,2]=day_in_week
"""
x_flow = x[..., 0]
x_time = x[..., 1]
x_day = x[..., 2]
B, T, N = x_flow.shape
assert T == self.seq_len
# project flow history
x_flat = x_flow.permute(0, 2, 1).reshape(B * N, T)
h0 = self.input_proj(x_flat).view(B, N, self.hidden_dim)
# time & day embeddings at last step
t_idx = (x_time[:, -1, :,] * (self.time_slots - 1)).long()
d_idx = x_day[:, -1, :,].long()
time_emb = self.time_embedding(t_idx)
day_emb = self.day_embedding(d_idx)
h0 = h0 + time_emb + day_emb
# two MoE Sandwich blocks + residuals
h1 = self.sandwich1(h0) + h0
h2 = self.sandwich2(h1) + h1
# output
out = self.out_proj(h2)
out = out.view(B, N, self.horizon, self.output_dim)
out = out.permute(0, 2, 1, 3)
return out
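
The MoE variant can be smoke-tested the same way; ExpertBlock's hard argmax routing sends each node either to one of the private experts or to the shared expert. Illustrative sizes:

import torch
args = {'horizon': 12, 'output_dim': 1, 'num_nodes': 170, 'num_experts': 8}
model = EXP(args)
x = torch.rand(2, 12, 170, 3)                           # (B, T, N, D)
x[..., 2] = torch.randint(0, 7, (2, 12, 170)).float()   # day index in 0..6
print(model(x).shape)                                   # torch.Size([2, 12, 170, 1])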

model/EXP/EXP8b.py (new file)

@@ -0,0 +1,133 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
"""
Version with residual connections
"""
class DynamicGraphConstructor(nn.Module):
def __init__(self, node_num, embed_dim):
super().__init__()
self.nodevec1 = nn.Parameter(torch.randn(node_num, embed_dim), requires_grad=True)
self.nodevec2 = nn.Parameter(torch.randn(node_num, embed_dim), requires_grad=True)
def forward(self):
# (N, D) @ (D, N) -> (N, N)
adj = torch.matmul(self.nodevec1, self.nodevec2.T)
adj = F.relu(adj)
adj = F.softmax(adj, dim=-1)
return adj
class GraphConvBlock(nn.Module):
def __init__(self, input_dim, output_dim):
super().__init__()
self.theta = nn.Linear(input_dim, output_dim)
self.residual = input_dim == output_dim
if not self.residual:
self.res_proj = nn.Linear(input_dim, output_dim)
def forward(self, x, adj):
# x: (B, N, C) / adj: (N, N)
res = x
x = torch.matmul(adj, x) # (B, N, C)
x = self.theta(x)
# residual connection
if self.residual:
x = x + res
else:
x = x + self.res_proj(res)
return F.relu(x)
class MANBA_Block(nn.Module):
def __init__(self, input_dim, hidden_dim):
super().__init__()
self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
self.ffn = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, input_dim)
)
self.norm1 = nn.LayerNorm(input_dim)
self.norm2 = nn.LayerNorm(input_dim)
def forward(self, x):
# x: (B, T, C)
res = x
x_attn, _ = self.attn(x, x, x)
x = self.norm1(res + x_attn)
res2 = x
x_ffn = self.ffn(x)
x = self.norm2(res2 + x_ffn)
return x
class EXP(nn.Module):
def __init__(self, args):
super().__init__()
self.horizon = args['horizon']
self.output_dim = args['output_dim']
self.seq_len = args.get('in_len', 12)
self.hidden_dim = args.get('hidden_dim', 64)
self.num_nodes = args['num_nodes']
self.time_slots = args.get('time_slots', 24 * 60 // args.get('time_slot', 5))
self.time_embedding = nn.Embedding(self.time_slots, self.hidden_dim)
self.day_embedding = nn.Embedding(7, self.hidden_dim)
# dynamic graph construction
self.graph = DynamicGraphConstructor(self.num_nodes, embed_dim=16)
# input projection layer
self.input_proj = nn.Linear(self.seq_len, self.hidden_dim)
# graph convolution
self.gc = GraphConvBlock(self.hidden_dim, self.hidden_dim)
# MANBA block
self.manba = MANBA_Block(self.hidden_dim, self.hidden_dim * 2)
# output projection
self.out_proj = nn.Linear(self.hidden_dim, self.horizon * self.output_dim)
def forward(self, x):
# x: (B, T, N, D_total)
x_time = x[..., 1] # (B, T, N)
x_day = x[..., 2] # (B, T, N)
x = x[..., 0] # use only the main channel (B, T, N)
B, T, N = x.shape
assert T == self.seq_len
# input projection (B, T, N) -> (B, N, T) -> (B*N, T) -> (B*N, H)
x = x.permute(0, 2, 1).reshape(B * N, T)
h = self.input_proj(x) # (B*N, hidden_dim)
h = h.view(B, N, self.hidden_dim)
t_idx = (x_time[:, -1, :,] * (self.time_slots - 1)).long() # (B, N)
d_idx = x_day[:, -1, :,].long() # (B, N)
time_emb = self.time_embedding(t_idx) # (B, N, hidden_dim)
day_emb = self.day_embedding(d_idx) # (B, N, hidden_dim)
# 3) inject them into the initial hidden state
h = h + time_emb + day_emb
# build the dynamic graph
adj = self.graph() # (N, N)
# spatial modeling: graph convolution
h = self.gc(h, adj) # (B, N, hidden_dim)
# temporal modeling: MANBA
h = self.manba(h) # (B, N, hidden_dim)
# output projection
out = self.out_proj(h) # (B, N, horizon * output_dim)
out = out.view(B, N, self.horizon, self.output_dim).permute(0, 2, 1, 3)
return out # (B, horizon, N, output_dim)

model/STID/MLP.py (new file)

@@ -0,0 +1,29 @@
import torch
from torch import nn
class MultiLayerPerceptron(nn.Module):
"""Multi-Layer Perceptron with residual links."""
def __init__(self, input_dim, hidden_dim) -> None:
super().__init__()
self.fc1 = nn.Conv2d(
in_channels=input_dim, out_channels=hidden_dim, kernel_size=(1, 1), bias=True)
self.fc2 = nn.Conv2d(
in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=(1, 1), bias=True)
self.act = nn.ReLU()
self.drop = nn.Dropout(p=0.15)
def forward(self, input_data: torch.Tensor) -> torch.Tensor:
"""Feed forward of MLP.
Args:
input_data (torch.Tensor): input data with shape [B, D, N, 1]
Returns:
torch.Tensor: latent repr
"""
hidden = self.fc2(self.drop(self.act(self.fc1(input_data)))) # MLP
hidden = hidden + input_data # residual
return hidden

model/STID/STID.py (new file)

@@ -0,0 +1,117 @@
import torch
from torch import nn
from model.STID.MLP import MultiLayerPerceptron
class STID(nn.Module):
"""
Paper: Spatial-Temporal Identity: A Simple yet Effective Baseline for Multivariate Time Series Forecasting
Link: https://arxiv.org/abs/2208.05233
Official Code: https://github.com/zezhishao/STID
"""
def __init__(self, model_args):
super().__init__()
# attributes
self.num_nodes = model_args["num_nodes"]
self.node_dim = model_args["node_dim"]
self.input_len = model_args["input_len"]
self.input_dim = model_args["input_dim"]
self.embed_dim = model_args["embed_dim"]
self.output_len = model_args["output_len"]
self.num_layer = model_args["num_layer"]
self.temp_dim_tid = model_args["temp_dim_tid"]
self.temp_dim_diw = model_args["temp_dim_diw"]
self.time_of_day_size = model_args["time_of_day_size"]
self.day_of_week_size = model_args["day_of_week_size"]
self.if_time_in_day = model_args["if_T_i_D"]
self.if_day_in_week = model_args["if_D_i_W"]
self.if_spatial = model_args["if_node"]
# spatial embeddings
if self.if_spatial:
self.node_emb = nn.Parameter(torch.empty(self.num_nodes, self.node_dim))
nn.init.xavier_uniform_(self.node_emb)
# temporal embeddings
if self.if_time_in_day:
self.time_in_day_emb = nn.Parameter(
torch.empty(self.time_of_day_size, self.temp_dim_tid))
nn.init.xavier_uniform_(self.time_in_day_emb)
if self.if_day_in_week:
self.day_in_week_emb = nn.Parameter(
torch.empty(self.day_of_week_size, self.temp_dim_diw))
nn.init.xavier_uniform_(self.day_in_week_emb)
# embedding layer
self.time_series_emb_layer = nn.Conv2d(
in_channels=self.input_dim * self.input_len, out_channels=self.embed_dim, kernel_size=(1, 1), bias=True)
# encoding
self.hidden_dim = self.embed_dim+self.node_dim * \
int(self.if_spatial)+self.temp_dim_tid*int(self.if_day_in_week) + \
self.temp_dim_diw*int(self.if_time_in_day)
self.encoder = nn.Sequential(
*[MultiLayerPerceptron(self.hidden_dim, self.hidden_dim) for _ in range(self.num_layer)])
# regression
self.regression_layer = nn.Conv2d(
in_channels=self.hidden_dim, out_channels=self.output_len, kernel_size=(1, 1), bias=True)
def forward(self, history_data: torch.Tensor) -> torch.Tensor:
"""Feed forward of STID.
Args:
history_data (torch.Tensor): history data with shape [B, L, N, C]
Returns:
torch.Tensor: prediction with shape [B, L, N, C]
"""
# prepare data
input_data = history_data[..., range(self.input_dim)]
# input_data = history_data[..., 0:1]
if self.if_time_in_day:
t_i_d_data = history_data[..., 1]
# In the datasets used in STID, the time_of_day feature is normalized to [0, 1]. We multiply it by 288 to get the index.
# If you use other datasets, you may need to change this line.
time_in_day_emb = self.time_in_day_emb[(t_i_d_data[:, -1, :] * self.time_of_day_size).type(torch.LongTensor)]
else:
time_in_day_emb = None
if self.if_day_in_week:
d_i_w_data = history_data[..., 2]
day_in_week_emb = self.day_in_week_emb[(d_i_w_data[:, -1, :] * self.day_of_week_size).type(torch.LongTensor)]
else:
day_in_week_emb = None
# time series embedding
batch_size, _, num_nodes, _ = input_data.shape
input_data = input_data.transpose(1, 2).contiguous()
input_data = input_data.view(
batch_size, num_nodes, -1).transpose(1, 2).unsqueeze(-1)
time_series_emb = self.time_series_emb_layer(input_data)
node_emb = []
if self.if_spatial:
# expand node embeddings
node_emb.append(self.node_emb.unsqueeze(0).expand(
batch_size, -1, -1).transpose(1, 2).unsqueeze(-1))
# temporal embeddings
tem_emb = []
if time_in_day_emb is not None:
tem_emb.append(time_in_day_emb.transpose(1, 2).unsqueeze(-1))
if day_in_week_emb is not None:
tem_emb.append(day_in_week_emb.transpose(1, 2).unsqueeze(-1))
# concate all embeddings
hidden = torch.cat([time_series_emb] + node_emb + tem_emb, dim=1)
# encoding
hidden = self.encoder(hidden)
# regression
prediction = self.regression_layer(hidden)
return prediction
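
A minimal forward-shape check using the values from config/STID/PEMSD4.yaml above (random input, so this only verifies shapes):

import torch
margs = {"num_nodes": 307, "node_dim": 32, "input_len": 12, "input_dim": 3,
         "embed_dim": 32, "output_len": 12, "num_layer": 3,
         "temp_dim_tid": 32, "temp_dim_diw": 32,
         "time_of_day_size": 288, "day_of_week_size": 7,
         "if_T_i_D": True, "if_D_i_W": True, "if_node": True}
model = STID(margs)
history = torch.rand(4, 12, 307, 3)    # [B, L, N, C]: flow, time_in_day, day_in_week
print(model(history).shape)            # torch.Size([4, 12, 307, 1])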

@@ -13,7 +13,8 @@ from model.STFGNN.STFGNN import STFGNN
from model.STSGCN.STSGCN import STSGCN
from model.STGODE.STGODE import ODEGCN
from model.PDG2SEQ.PDG2Seq import PDG2Seq
from model.EXP.EXP21 import EXP as EXP
from model.STID.STID import STID
from model.EXP.EXP26 import EXP as EXP
def model_selector(model):
match model['type']:
@@ -32,5 +33,6 @@ def model_selector(model):
case 'STSGCN': return STSGCN(model)
case 'STGODE': return ODEGCN(model)
case 'PDG2SEQ': return PDG2Seq(model)
case 'STID': return STID(model)
case 'EXP': return EXP(model)