diff --git a/Informer/Informer2020 b/Informer/Informer2020
new file mode 160000
index 0000000..29f2a73
--- /dev/null
+++ b/Informer/Informer2020
@@ -0,0 +1 @@
+Subproject commit 29f2a739226a509202a092b464163da81fa74960
diff --git a/config/Informer/METR-LA.yaml b/config/Informer/METR-LA.yaml
index aa0176e..de03c51 100644
--- a/config/Informer/METR-LA.yaml
+++ b/config/Informer/METR-LA.yaml
@@ -6,50 +6,47 @@ basic:
   seed: 2023
 
 data:
-  batch_size: 16
+  batch_size: 64
   column_wise: false
   days_per_week: 7
   horizon: 24
   input_dim: 1
   lag: 24
-  label_len: 24
   normalizer: std
   num_nodes: 207
   steps_per_day: 288
   test_ratio: 0.2
   val_ratio: 0.2
 
 model:
   activation: gelu
   seq_len: 24
-  label_len: 24
-  pred_len: 24
-  d_model: 128
-  d_ff: 2048
-  dropout: 0.1
-  e_layers: 2
-  d_layers: 1
-  n_heads: 8
-  output_attention: False
-  factor: 5
-  attn: prob
-  embed: fixed
-  freq: h
-  distil: true
-  mix: true
+  label_len: 12
+  out_len: 24
   enc_in: 1
   dec_in: 1
   c_out: 1
-
+  d_model: 64
+  d_ff: 512
+  dropout: 0.1
+  e_layers: 1
+  d_layers: 1
+  n_heads: 4
+  factor: 5
+  output_attention: False
+  distil: True
+  mix: True
+  attn: prob
+  embed: fixed
+  freq: h
 
 train:
-  batch_size: 16
+  batch_size: 64
   debug: false
   early_stop: true
   early_stop_patience: 15
-  epochs: 100
+  epochs: 1
   grad_norm: false
-  label_len: 24
   log_step: 1000
   loss_func: mae
   lr_decay: true
@@ -61,6 +58,5 @@ train:
   max_grad_norm: 5
   output_dim: 1
   plot: false
-  pred_len: 24
   real_value: true
   weight_decay: 0
\ No newline at end of file
diff --git a/config/Informer/NYCBike-InFlow.yaml b/config/Informer/NYCBike-InFlow.yaml
index 4cc3fc7..e378425 100644
--- a/config/Informer/NYCBike-InFlow.yaml
+++ b/config/Informer/NYCBike-InFlow.yaml
@@ -6,16 +6,15 @@ basic:
   seed: 2023
 
 data:
-  batch_size: 16
+  batch_size: 64
   column_wise: false
   days_per_week: 7
   horizon: 24
   input_dim: 1
   lag: 24
-  label_len: 24
   normalizer: std
   num_nodes: 128
   steps_per_day: 48
   test_ratio: 0.2
   val_ratio: 0.2
 
@@ -23,33 +22,31 @@ model:
   activation: gelu
   seq_len: 24
   label_len: 12
-  pred_len: 24
-  d_model: 128
-  d_ff: 2048
-  dropout: 0.1
-  e_layers: 2
-  d_layers: 1
-  n_heads: 8
-  output_attention: False
-  factor: 5
-  attn: prob
-  embed: fixed
-  freq: h
-  distil: true
-  mix: true
+  out_len: 24
   enc_in: 1
   dec_in: 1
   c_out: 1
-
+  d_model: 64
+  d_ff: 512
+  dropout: 0.1
+  e_layers: 1
+  d_layers: 1
+  n_heads: 4
+  factor: 5
+  output_attention: False
+  distil: True
+  mix: True
+  attn: prob
+  embed: fixed
+  freq: h
 
 train:
-  batch_size: 16
+  batch_size: 64
   debug: false
   early_stop: true
   early_stop_patience: 15
-  epochs: 100
+  epochs: 1
   grad_norm: false
-  label_len: 24
   log_step: 1000
   loss_func: mae
   lr_decay: true
@@ -61,6 +58,5 @@ train:
   max_grad_norm: 5
   output_dim: 1
   plot: false
-  pred_len: 24
   real_value: true
   weight_decay: 0
\ No newline at end of file
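Note on the window fields above: lag is the encoder lookback (seq_len), label_len is the warm-start slice handed to the decoder, and horizon/out_len is the forecast length. A minimal sketch of how they combine (batch size and tensors are illustrative; the slicing mirrors the decoder-input construction in model/Informer/model.py later in this diff):

    import torch

    lag, label_len, out_len = 24, 12, 24   # values from the configs above
    B, n_feat = 64, 1                      # batch_size and input_dim

    x_enc = torch.randn(B, lag, n_feat)    # encoder input: the full lookback window
    # decoder input: last label_len observed steps + out_len zero placeholders
    x_dec = torch.cat([x_enc[:, -label_len:, :],
                       torch.zeros(B, out_len, n_feat)], dim=1)
    print(x_dec.shape)                     # torch.Size([64, 36, 1])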
diff --git a/config/Informer/NYCBike-OutFlow.yaml b/config/Informer/NYCBike-OutFlow.yaml
index c46c7b4..05b73c6 100644
--- a/config/Informer/NYCBike-OutFlow.yaml
+++ b/config/Informer/NYCBike-OutFlow.yaml
@@ -6,16 +6,15 @@ basic:
   seed: 2023
 
 data:
-  batch_size: 16
+  batch_size: 64
   column_wise: false
   days_per_week: 7
   horizon: 24
   input_dim: 1
   lag: 24
-  label_len: 24
   normalizer: std
   num_nodes: 128
   steps_per_day: 48
   test_ratio: 0.2
   val_ratio: 0.2
 
@@ -23,33 +22,31 @@ model:
   activation: gelu
   seq_len: 24
   label_len: 12
-  pred_len: 24
-  d_model: 128
-  d_ff: 2048
-  dropout: 0.1
-  e_layers: 2
-  d_layers: 1
-  n_heads: 8
-  output_attention: False
-  factor: 5
-  attn: prob
-  embed: fixed
-  freq: h
-  distil: true
-  mix: true
+  out_len: 24
   enc_in: 1
   dec_in: 1
   c_out: 1
-
+  d_model: 64
+  d_ff: 512
+  dropout: 0.1
+  e_layers: 1
+  d_layers: 1
+  n_heads: 4
+  factor: 5
+  output_attention: False
+  distil: True
+  mix: True
+  attn: prob
+  embed: fixed
+  freq: h
 
 train:
-  batch_size: 16
+  batch_size: 64
   debug: false
   early_stop: true
   early_stop_patience: 15
-  epochs: 100
+  epochs: 1
   grad_norm: false
-  label_len: 24
   log_step: 1000
   loss_func: mae
   lr_decay: true
@@ -61,6 +58,5 @@ train:
   max_grad_norm: 5
   output_dim: 1
   plot: false
-  pred_len: 24
   real_value: true
   weight_decay: 0
\ No newline at end of file
diff --git a/config/Informer/SolarEnergy.yaml b/config/Informer/SolarEnergy.yaml
index 570c595..63492a2 100644
--- a/config/Informer/SolarEnergy.yaml
+++ b/config/Informer/SolarEnergy.yaml
@@ -17,28 +17,30 @@ data:
   steps_per_day: 24
   test_ratio: 0.2
   val_ratio: 0.2
-
+
 model:
   activation: gelu
   seq_len: 24
   label_len: 12
-  pred_len: 24
-  d_model: 128
-  d_ff: 2048
-  dropout: 0.1
-  e_layers: 2
-  d_layers: 1
-  n_heads: 8
-  output_attention: False
-  factor: 5
-  attn: prob
-  embed: fixed
-  freq: h
-  distil: true
-  mix: true
+  out_len: 24
   enc_in: 1
   dec_in: 1
   c_out: 1
+  d_model: 64
+  d_ff: 512
+  dropout: 0.1
+  e_layers: 1
+  d_layers: 1
+  n_heads: 4
+  factor: 5
+  output_attention: False
+  distil: True
+  mix: True
+  attn: prob
+  embed: fixed
+  freq: h
+
+
 train:
@@ -48,7 +50,6 @@ train:
   early_stop_patience: 15
   epochs: 100
   grad_norm: false
-  label_len: 24
   log_step: 1000
   loss_func: mae
   lr_decay: true
@@ -60,6 +61,5 @@ train:
   max_grad_norm: 5
   output_dim: 1
   plot: false
-  pred_len: 24
   real_value: true
   weight_decay: 0
\ No newline at end of file
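How a YAML's model section reaches the model constructor is not part of this diff; a minimal sketch under the assumption that it is loaded with PyYAML and handed over as a plain dict (the device merge step is hypothetical):

    import yaml

    with open("config/Informer/SolarEnergy.yaml") as f:
        cfg = yaml.safe_load(f)

    model_cfg = dict(cfg["model"])   # seq_len, label_len, out_len, d_model, ...
    model_cfg["device"] = "cpu"      # hypothetical merge of the globally injected device
    # Informer(configs) below reads these keys defensively, e.g.
    # configs.get("out_len", 24) and configs.get("attn", "prob").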
diff --git a/model/Informer/Informer_old/attn.py b/model/Informer/Informer_old/attn.py
new file mode 100644
index 0000000..45344a8
--- /dev/null
+++ b/model/Informer/Informer_old/attn.py
@@ -0,0 +1,163 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import numpy as np
+
+from math import sqrt
+from model.Informer.masking import TriangularCausalMask, ProbMask
+
+class FullAttention(nn.Module):
+    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
+        super(FullAttention, self).__init__()
+        self.scale = scale
+        self.mask_flag = mask_flag
+        self.output_attention = output_attention
+        self.dropout = nn.Dropout(attention_dropout)
+
+    def forward(self, queries, keys, values, attn_mask):
+        B, L, H, E = queries.shape
+        _, S, _, D = values.shape
+        scale = self.scale or 1./sqrt(E)
+
+        scores = torch.einsum("blhe,bshe->bhls", queries, keys)
+        if self.mask_flag:
+            if attn_mask is None:
+                attn_mask = TriangularCausalMask(B, L, device=queries.device)
+
+            scores.masked_fill_(attn_mask.mask, -np.inf)
+
+        A = self.dropout(torch.softmax(scale * scores, dim=-1))
+        V = torch.einsum("bhls,bshd->blhd", A, values)
+
+        if self.output_attention:
+            return (V.contiguous(), A)
+        else:
+            return (V.contiguous(), None)
+
+class ProbAttention(nn.Module):
+    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
+        super(ProbAttention, self).__init__()
+        self.factor = factor
+        self.scale = scale
+        self.mask_flag = mask_flag
+        self.output_attention = output_attention
+        self.dropout = nn.Dropout(attention_dropout)
+
+    def _prob_QK(self, Q, K, sample_k, n_top): # n_top: c*ln(L_q)
+        # Q [B, H, L, D]
+        B, H, L_K, E = K.shape
+        _, _, L_Q, _ = Q.shape
+
+        # calculate the sampled Q_K
+        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
+        index_sample = torch.randint(L_K, (L_Q, sample_k)) # real U = U_part(factor*ln(L_k))*L_q
+        K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :]
+        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(-2)
+
+        # find the Top_k query with sparsity measurement
+        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
+        M_top = M.topk(n_top, sorted=False)[1]
+
+        # use the reduced Q to calculate Q_K
+        Q_reduce = Q[torch.arange(B)[:, None, None],
+                     torch.arange(H)[None, :, None],
+                     M_top, :] # factor*ln(L_q)
+        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1)) # factor*ln(L_q)*L_k
+
+        return Q_K, M_top
+
+    def _get_initial_context(self, V, L_Q):
+        B, H, L_V, D = V.shape
+        if not self.mask_flag:
+            # V_sum = V.sum(dim=-2)
+            V_sum = V.mean(dim=-2)
+            contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone()
+        else: # use mask
+            assert(L_Q == L_V) # requires that L_Q == L_V, i.e. for self-attention only
+            contex = V.cumsum(dim=-2)
+        return contex
+
+    def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
+        B, H, L_V, D = V.shape
+
+        if self.mask_flag:
+            attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device)
+            scores.masked_fill_(attn_mask.mask, -np.inf)
+
+        attn = torch.softmax(scores, dim=-1) # nn.Softmax(dim=-1)(scores)
+
+        context_in[torch.arange(B)[:, None, None],
+                   torch.arange(H)[None, :, None],
+                   index, :] = torch.matmul(attn, V).type_as(context_in)
+        if self.output_attention:
+            attns = (torch.ones([B, H, L_V, L_V])/L_V).type_as(attn).to(attn.device)
+            attns[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = attn
+            return (context_in, attns)
+        else:
+            return (context_in, None)
+
+    def forward(self, queries, keys, values, attn_mask):
+        B, L_Q, H, D = queries.shape
+        _, L_K, _, _ = keys.shape
+
+        queries = queries.transpose(2,1)
+        keys = keys.transpose(2,1)
+        values = values.transpose(2,1)
+
+        U_part = self.factor * np.ceil(np.log(L_K)).astype('int').item() # c*ln(L_k)
+        u = self.factor * np.ceil(np.log(L_Q)).astype('int').item() # c*ln(L_q)
+
+        U_part = U_part if U_part < L_K else L_K
+        u = u if u < L_Q else L_Q
+
+        scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u)
+
+        # add scale factor
+        scale = self.scale or 1./sqrt(D)
+        if scale is not None:
+            scores_top = scores_top * scale
+        # get the context
+        context = self._get_initial_context(values, L_Q)
+        # update the context with selected top_k queries
+        context, attn = self._update_context(context, values, scores_top, index, L_Q, attn_mask)
+
+        return context.transpose(2,1).contiguous(), attn
+
+
+class AttentionLayer(nn.Module):
+    def __init__(self, attention, d_model, n_heads,
+                 d_keys=None, d_values=None, mix=False):
+        super(AttentionLayer, self).__init__()
+
+        d_keys = d_keys or (d_model//n_heads)
+        d_values = d_values or (d_model//n_heads)
+
+        self.inner_attention = attention
+        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
+        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
+        self.value_projection = nn.Linear(d_model, d_values * n_heads)
+        self.out_projection = nn.Linear(d_values * n_heads, d_model)
+        self.n_heads = n_heads
+        self.mix = mix
+
+    def forward(self, queries, keys, values, attn_mask):
+        B, L, _ = queries.shape
+        _, S, _ = keys.shape
+        H = self.n_heads
+
+        queries = self.query_projection(queries).view(B, L, H, -1)
+        keys = self.key_projection(keys).view(B, S, H, -1)
+        values = self.value_projection(values).view(B, S, H, -1)
+
+        out, attn = self.inner_attention(
+            queries,
+            keys,
+            values,
+            attn_mask
+        )
+        if self.mix:
+            out = out.transpose(2,1).contiguous()
+        out = out.view(B, L, -1)
+
+        return self.out_projection(out), attn
\ No newline at end of file
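A quick shape check for ProbAttention (a minimal sketch; the [B, L, H, D] layout matches what AttentionLayer above feeds it):

    import torch

    B, L, H, D = 2, 24, 4, 16
    attn = ProbAttention(mask_flag=False, factor=5, attention_dropout=0.1)

    q = torch.randn(B, L, H, D)
    k = torch.randn(B, L, H, D)
    v = torch.randn(B, L, H, D)

    out, _ = attn(q, k, v, attn_mask=None)
    print(out.shape)  # torch.Size([2, 24, 4, 16])
    # Only the c*ln(L) highest-scoring queries attend exactly;
    # the rest fall back to the mean of V from _get_initial_context.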
diff --git a/model/Informer/Informer_old/encoder.py b/model/Informer/Informer_old/encoder.py
new file mode 100644
--- /dev/null
+++ b/model/Informer/Informer_old/encoder.py
@@ -0,0 +1,98 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class ConvLayer(nn.Module):
+    def __init__(self, c_in):
+        super(ConvLayer, self).__init__()
+        padding = 1 if torch.__version__>='1.5.0' else 2
+        self.downConv = nn.Conv1d(in_channels=c_in,
+                                  out_channels=c_in,
+                                  kernel_size=3,
+                                  padding=padding,
+                                  padding_mode='circular')
+        self.norm = nn.BatchNorm1d(c_in)
+        self.activation = nn.ELU()
+        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
+
+    def forward(self, x):
+        x = self.downConv(x.permute(0, 2, 1))
+        x = self.norm(x)
+        x = self.activation(x)
+        x = self.maxPool(x)
+        x = x.transpose(1,2)
+        return x
+
+class EncoderLayer(nn.Module):
+    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
+        super(EncoderLayer, self).__init__()
+        d_ff = d_ff or 4*d_model
+        self.attention = attention
+        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = F.relu if activation == "relu" else F.gelu
+
+    def forward(self, x, attn_mask=None):
+        # x [B, L, D]
+        # x = x + self.dropout(self.attention(
+        #     x, x, x,
+        #     attn_mask = attn_mask
+        # ))
+        new_x, attn = self.attention(
+            x, x, x,
+            attn_mask = attn_mask
+        )
+        x = x + self.dropout(new_x)
+
+        y = x = self.norm1(x)
+        y = self.dropout(self.activation(self.conv1(y.transpose(-1,1))))
+        y = self.dropout(self.conv2(y).transpose(-1,1))
+
+        return self.norm2(x+y), attn
+
+class Encoder(nn.Module):
+    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
+        super(Encoder, self).__init__()
+        self.attn_layers = nn.ModuleList(attn_layers)
+        self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None
+        self.norm = norm_layer
+
+    def forward(self, x, attn_mask=None):
+        # x [B, L, D]
+        attns = []
+        if self.conv_layers is not None:
+            for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers):
+                x, attn = attn_layer(x, attn_mask=attn_mask)
+                x = conv_layer(x)
+                attns.append(attn)
+            x, attn = self.attn_layers[-1](x, attn_mask=attn_mask)
+            attns.append(attn)
+        else:
+            for attn_layer in self.attn_layers:
+                x, attn = attn_layer(x, attn_mask=attn_mask)
+                attns.append(attn)
+
+        if self.norm is not None:
+            x = self.norm(x)
+
+        return x, attns
+
+class EncoderStack(nn.Module):
+    def __init__(self, encoders, inp_lens):
+        super(EncoderStack, self).__init__()
+        self.encoders = nn.ModuleList(encoders)
+        self.inp_lens = inp_lens
+
+    def forward(self, x, attn_mask=None):
+        # x [B, L, D]
+        x_stack = []; attns = []
+        for i_len, encoder in zip(self.inp_lens, self.encoders):
+            inp_len = x.shape[1]//(2**i_len)
+            x_s, attn = encoder(x[:, -inp_len:, :])
+            x_stack.append(x_s); attns.append(attn)
+        x_stack = torch.cat(x_stack, -2)
+
+        return x_stack, attns
\ No newline at end of file
diff --git a/model/Informer/Informer_old/head.py b/model/Informer/Informer_old/head.py
new file mode 100644
index 0000000..de0978b
--- /dev/null
+++ b/model/Informer/Informer_old/head.py
@@ -0,0 +1,25 @@
+# model/Informer/head.py
+import torch
+import torch.nn as nn
+
+
+class TemporalProjectionHead(nn.Module):
+    """
+    Project along the temporal dimension:
+    [B, L, D] -> [B, pred_len, C]
+    """
+
+    def __init__(self, d_model, pred_len, c_out):
+        super().__init__()
+        self.temporal_proj = nn.Linear(1, pred_len)
+        self.channel_proj = nn.Linear(d_model, c_out)
+
+    def forward(self, x):
+        # x: [B, L, D]
+        # Average over the sequence dimension and then project
+        x = x.mean(dim=1, keepdim=True)  # [B, 1, D]
+        x = x.transpose(1, 2)            # [B, D, 1]
+        x = self.temporal_proj(x)        # [B, D, pred_len]
+        x = x.transpose(1, 2)            # [B, pred_len, D]
+        x = self.channel_proj(x)         # [B, pred_len, C]
+        return x
diff --git a/model/Informer/Informer_old/masking.py b/model/Informer/Informer_old/masking.py
new file mode 100644
index 0000000..7fd479e
--- /dev/null
+++ b/model/Informer/Informer_old/masking.py
@@ -0,0 +1,24 @@
+import torch
+
+class TriangularCausalMask():
+    def __init__(self, B, L, device="cpu"):
+        mask_shape = [B, 1, L, L]
+        with torch.no_grad():
+            self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)
+
+    @property
+    def mask(self):
+        return self._mask
+
+class ProbMask():
+    def __init__(self, B, H, L, index, scores, device="cpu"):
+        _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
+        _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
+        indicator = _mask_ex[torch.arange(B)[:, None, None],
+                             torch.arange(H)[None, :, None],
+                             index, :].to(device)
+        self._mask = indicator.view(scores.shape).to(device)
+
+    @property
+    def mask(self):
+        return self._mask
\ No newline at end of file
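For reference, FullAttention builds a TriangularCausalMask itself when mask_flag=True and no mask is passed; a minimal sketch:

    import torch

    B, L, H, D = 2, 8, 2, 4
    mask = TriangularCausalMask(B, L)   # bool mask [B, 1, L, L], True above the diagonal
    print(mask.mask[0, 0])              # True entries are the blocked (future) positions

    causal = FullAttention(mask_flag=True)
    out, _ = causal(torch.randn(B, L, H, D), torch.randn(B, L, H, D),
                    torch.randn(B, L, H, D), attn_mask=None)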
diff --git a/model/Informer/Informer_old/model.py b/model/Informer/Informer_old/model.py
new file mode 100644
index 0000000..8cf072a
--- /dev/null
+++ b/model/Informer/Informer_old/model.py
@@ -0,0 +1,48 @@
+import torch
+import torch.nn as nn
+
+from model.Informer.encoder import Encoder, EncoderLayer, ConvLayer
+from model.Informer.attn import FullAttention, ProbAttention, AttentionLayer
+from model.Informer.embed import DataEmbedding
+from model.Informer.head import TemporalProjectionHead
+
+
+class InformerEncoder(nn.Module):
+
+    def __init__(self, configs):
+        super().__init__()
+
+        self.seq_len = configs["seq_len"]
+        self.pred_len = configs["pred_len"]
+
+        Attn = ProbAttention if configs["attn"] == "prob" else FullAttention
+
+        # Embedding (keyword arg so dropout is not misread as embed_type)
+        self.embedding = DataEmbedding(configs["enc_in"], configs["d_model"], dropout=configs["dropout"])
+
+        # Encoder (Attn-Conv-Norm); keyword args so dropout is not misread as
+        # the attention scale, and False is not misread as d_keys
+        self.encoder = Encoder(
+            [EncoderLayer(
+                # Attn
+                AttentionLayer(Attn(False, configs["factor"], attention_dropout=configs["dropout"], output_attention=False),
+                               configs["d_model"], configs["n_heads"], mix=False),
+                configs["d_model"], configs["d_ff"], configs["dropout"], configs["activation"])
+             for _ in range(configs["e_layers"])],
+            # Conv
+            [ConvLayer(configs["d_model"]) for _ in range(configs["e_layers"] - 1)]
+            # Norm
+            if configs.get("distil") else None, norm_layer=nn.LayerNorm(configs["d_model"])
+        )
+
+        # Forecast head
+        self.head = TemporalProjectionHead(
+            d_model=configs["d_model"],
+            pred_len=configs["pred_len"],
+            c_out=configs["c_out"],
+        )
+
+    def forward(self, x_enc):
+        x = self.embedding(x_enc, None)  # encoder-only variant uses no time marks
+        x, _ = self.encoder(x)
+        out = self.head(x)
+        return out[:, -self.pred_len:, :]
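The archived encoder-only variant maps [B, seq_len, enc_in] directly to [B, pred_len, c_out] with no decoder; a minimal smoke test (hyperparameters mirror the YAMLs above, and this assumes the keyword-argument fixes in the file as written here):

    import torch

    configs = {"seq_len": 24, "pred_len": 24, "attn": "prob",
               "enc_in": 1, "c_out": 1, "d_model": 64, "d_ff": 512,
               "dropout": 0.1, "e_layers": 1, "n_heads": 4,
               "factor": 5, "activation": "gelu", "distil": True}

    model = InformerEncoder(configs)
    y = model(torch.randn(8, 24, 1))   # embedding -> encoder -> projection head
    print(y.shape)                     # torch.Size([8, 24, 1])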
diff --git a/model/Informer/embed.py b/model/Informer/embed.py
index 5dc2b9d..2737827 100644
--- a/model/Informer/embed.py
+++ b/model/Informer/embed.py
@@ -1,36 +1,111 @@
-# model/InformerOnlyX/embed.py
+# model/Informer/embed.py
 import torch
 import torch.nn as nn
 import math
 
+class TokenEmbedding(nn.Module):
+    def __init__(self, c_in, d_model):
+        super(TokenEmbedding, self).__init__()
+        padding = 1 if torch.__version__>='1.5.0' else 2
+        self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model,
+                                   kernel_size=3, padding=padding, padding_mode='circular')
+        for m in self.modules():
+            if isinstance(m, nn.Conv1d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
+
+    def forward(self, x):
+        x = self.tokenConv(x.permute(0, 2, 1)).transpose(1,2)
+        return x
 
 class PositionalEmbedding(nn.Module):
     def __init__(self, d_model, max_len=5000):
-        super().__init__()
-        pe = torch.zeros(max_len, d_model)
-        position = torch.arange(0, max_len).unsqueeze(1).float()
-        div_term = torch.exp(
-            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
-        )
+        super(PositionalEmbedding, self).__init__()
+        # Compute the positional encodings once in log space.
+        pe = torch.zeros(max_len, d_model).float()
+        pe.requires_grad = False
+
+        position = torch.arange(0, max_len).float().unsqueeze(1)
+        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
+
         pe[:, 0::2] = torch.sin(position * div_term)
         pe[:, 1::2] = torch.cos(position * div_term)
-        self.register_buffer("pe", pe.unsqueeze(0))  # [1, L, D]
+
+        pe = pe.unsqueeze(0)
+        self.register_buffer('pe', pe)
 
     def forward(self, x):
         return self.pe[:, :x.size(1)]
 
+class FixedEmbedding(nn.Module):
+    def __init__(self, c_in, d_model):
+        super(FixedEmbedding, self).__init__()
 
-class DataEmbedding(nn.Module):
-    """
-    Informer-style embedding without time covariates
-    """
+        w = torch.zeros(c_in, d_model).float()
+        w.requires_grad = False
 
-    def __init__(self, c_in, d_model, dropout):
-        super().__init__()
-        self.value_embedding = nn.Linear(c_in, d_model)
-        self.position_embedding = PositionalEmbedding(d_model)
-        self.dropout = nn.Dropout(dropout)
+        position = torch.arange(0, c_in).float().unsqueeze(1)
+        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
+
+        w[:, 0::2] = torch.sin(position * div_term)
+        w[:, 1::2] = torch.cos(position * div_term)
+
+        self.emb = nn.Embedding(c_in, d_model)
+        self.emb.weight = nn.Parameter(w, requires_grad=False)
 
     def forward(self, x):
-        x = self.value_embedding(x) + self.position_embedding(x)
+        return self.emb(x).detach()
+
+class TemporalEmbedding(nn.Module):
+    def __init__(self, d_model, embed_type='fixed', freq='h'):
+        super(TemporalEmbedding, self).__init__()
+
+        minute_size = 4; hour_size = 24
+        weekday_size = 7; day_size = 32; month_size = 13
+
+        Embed = FixedEmbedding if embed_type=='fixed' else nn.Embedding
+        if freq=='t':
+            self.minute_embed = Embed(minute_size, d_model)
+        self.hour_embed = Embed(hour_size, d_model)
+        self.weekday_embed = Embed(weekday_size, d_model)
+        self.day_embed = Embed(day_size, d_model)
+        self.month_embed = Embed(month_size, d_model)
+
+    def forward(self, x):
+        x = x.long()
+
+        minute_x = self.minute_embed(x[:,:,4]) if hasattr(self, 'minute_embed') else 0.
+        hour_x = self.hour_embed(x[:,:,3])
+        weekday_x = self.weekday_embed(x[:,:,2])
+        day_x = self.day_embed(x[:,:,1])
+        month_x = self.month_embed(x[:,:,0])
+
+        return hour_x + weekday_x + day_x + month_x + minute_x
+
+class TimeFeatureEmbedding(nn.Module):
+    def __init__(self, d_model, embed_type='timeF', freq='h'):
+        super(TimeFeatureEmbedding, self).__init__()
+
+        freq_map = {'h':4, 't':5, 's':6, 'm':1, 'a':1, 'w':2, 'd':3, 'b':3}
+        d_inp = freq_map[freq]
+        self.embed = nn.Linear(d_inp, d_model)
+
+    def forward(self, x):
+        return self.embed(x)
+
+class DataEmbedding(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) if embed_type!='timeF' else TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq)
+
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        if x_mark is None:
+            x = self.value_embedding(x) + self.position_embedding(x)
+        else:
+            x = self.value_embedding(x) + self.position_embedding(x) + self.temporal_embedding(x_mark)
+        return self.dropout(x)
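Shape-wise, DataEmbedding sums token, positional, and (when marks are given) temporal terms; a minimal sketch, assuming the time-mark columns are ordered month, day, weekday, hour as TemporalEmbedding indexes them for freq='h':

    import torch

    emb = DataEmbedding(c_in=1, d_model=64, embed_type='fixed', freq='h')

    x = torch.randn(32, 24, 1)              # [B, L, c_in] values
    x_mark = torch.zeros(32, 24, 4).long()  # [B, L, 4] time marks
    print(emb(x, x_mark).shape)             # torch.Size([32, 24, 64])
    print(emb(x, None).shape)               # token + positional only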
diff --git a/model/Informer/model.py b/model/Informer/model.py
index 8cf072a..251104b 100644
--- a/model/Informer/model.py
+++ b/model/Informer/model.py
@@ -1,48 +1,209 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
-from model.Informer.encoder import Encoder, EncoderLayer, ConvLayer
+from model.Informer.encoder import Encoder, EncoderLayer, ConvLayer, EncoderStack
+from model.Informer.decoder import Decoder, DecoderLayer
 from model.Informer.attn import FullAttention, ProbAttention, AttentionLayer
 from model.Informer.embed import DataEmbedding
-from model.Informer.head import TemporalProjectionHead
-
-
-class InformerEncoder(nn.Module):
+from model.Informer.masking import TriangularCausalMask, ProbMask
 
+class Informer(nn.Module):
     def __init__(self, configs):
-        super().__init__()
-
-        self.seq_len = configs["seq_len"]
-        self.pred_len = configs["pred_len"]
-
-        Attn = ProbAttention if configs["attn"] == "prob" else FullAttention
-
-        # Embedding
-        self.embedding = DataEmbedding(configs["enc_in"], configs["d_model"], configs["dropout"])
-
-        # Encoder(Attn-Conv-Norm)
+        super(Informer, self).__init__()
+        # Extract hyperparameters from the configs dict
+        self.enc_in = configs.get("enc_in", 1)
+        self.dec_in = configs.get("dec_in", 1)
+        self.c_out = configs.get("c_out", 1)
+        self.seq_len = configs.get("seq_len", 96)
+        self.label_len = configs.get("label_len", 48)
+        self.out_len = configs.get("out_len", 24)
+        self.factor = configs.get("factor", 5)
+        self.d_model = configs.get("d_model", 512)
+        self.n_heads = configs.get("n_heads", 8)
+        self.e_layers = configs.get("e_layers", 3)
+        self.d_layers = configs.get("d_layers", 2)
+        self.d_ff = configs.get("d_ff", 512)
+        self.dropout = configs.get("dropout", 0.0)
+        self.attn = configs.get("attn", "prob")
+        self.embed = configs.get("embed", "fixed")
+        self.freq = configs.get("freq", "h")
+        self.activation = configs.get("activation", "gelu")
+        self.output_attention = configs.get("output_attention", False)
+        self.distil = configs.get("distil", True)
+        self.mix = configs.get("mix", True)
+        self.device = configs.get("device", torch.device('cuda:0'))
+
+        self.pred_len = self.out_len
+
+        # Embedding layers
+        self.enc_embedding = DataEmbedding(self.enc_in, self.d_model, self.embed, self.freq, self.dropout)
+        self.dec_embedding = DataEmbedding(self.dec_in, self.d_model, self.embed, self.freq, self.dropout)
+
+        # Attention type
+        Attn = ProbAttention if self.attn == 'prob' else FullAttention
+
+        # Encoder
         self.encoder = Encoder(
-            [EncoderLayer(
-                # Attn
-                AttentionLayer(Attn(False, configs["factor"], configs["dropout"], False),
-                               configs["d_model"], configs["n_heads"], False),
-                configs["d_model"], configs["d_ff"], configs["dropout"], configs["activation"])
-             for _ in range(configs["e_layers"])],
-            # Conv
-            [ConvLayer(configs["d_model"]) for _ in range(configs["e_layers"] - 1)]
-            # Norm
-            if configs.get("distil") else None, norm_layer=nn.LayerNorm(configs["d_model"])
+            [
+                EncoderLayer(
+                    AttentionLayer(Attn(False, self.factor, attention_dropout=self.dropout, output_attention=self.output_attention),
+                                   self.d_model, self.n_heads, mix=False),
+                    self.d_model,
+                    self.d_ff,
+                    dropout=self.dropout,
+                    activation=self.activation
+                ) for l in range(self.e_layers)
+            ],
+            [
+                ConvLayer(
+                    self.d_model
+                ) for l in range(self.e_layers - 1)
+            ] if self.distil else None,
+            norm_layer=torch.nn.LayerNorm(self.d_model)
         )
-
-        # Forecast Head
-        self.head = TemporalProjectionHead(
-            d_model=configs["d_model"],
-            pred_len=configs["pred_len"],
-            c_out=configs["c_out"],
+
+        # Decoder
+        self.decoder = Decoder(
+            [
+                DecoderLayer(
+                    AttentionLayer(Attn(True, self.factor, attention_dropout=self.dropout, output_attention=False),
+                                   self.d_model, self.n_heads, mix=self.mix),
+                    AttentionLayer(FullAttention(False, self.factor, attention_dropout=self.dropout, output_attention=False),
+                                   self.d_model, self.n_heads, mix=False),
+                    self.d_model,
+                    self.d_ff,
+                    dropout=self.dropout,
+                    activation=self.activation,
+                )
+                for l in range(self.d_layers)
+            ],
+            norm_layer=torch.nn.LayerNorm(self.d_model)
         )
+
+        # Output projection
+        self.projection = nn.Linear(self.d_model, self.c_out, bias=True)
+
+    def forward(self, x_enc, x_mark_enc=None, x_dec=None, x_mark_dec=None,
+                enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None):
+        # If x_dec / x_mark_dec are not provided, build them from x_enc and label_len
+        if x_dec is None:
+            x_dec = torch.cat([x_enc[:, -self.label_len:, :], torch.zeros_like(x_enc[:, :self.pred_len, :])], dim=1)
+        if x_mark_dec is None and x_mark_enc is not None:
+            x_mark_dec = torch.cat([x_mark_enc[:, -self.label_len:, :], torch.zeros_like(x_mark_enc[:, :self.pred_len, :])], dim=1)
+
+        # Encode
+        enc_out = self.enc_embedding(x_enc, x_mark_enc)
+        enc_out, attns = self.encoder(enc_out, attn_mask=enc_self_mask)
+
+        # Decode
+        dec_out = self.dec_embedding(x_dec, x_mark_dec)
+        dec_out = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask)
+        dec_out = self.projection(dec_out)
+
+        if self.output_attention:
+            return dec_out[:, -self.pred_len:, :], attns
+        else:
+            return dec_out[:, -self.pred_len:, :]  # [B, L, D]
 
-    def forward(self, x_enc):
-        x = self.embedding(x_enc)
-        x, _ = self.encoder(x)
-        out = self.head(x)
-        return out[:, -self.pred_len :, :]
+
+class InformerStack(nn.Module):
+    def __init__(self, configs):
+        super(InformerStack, self).__init__()
+        # Extract hyperparameters from the configs dict
+        self.enc_in = configs.get("enc_in", 1)
+        self.dec_in = configs.get("dec_in", 1)
+        self.c_out = configs.get("c_out", 1)
+        self.seq_len = configs.get("seq_len", 96)
+        self.label_len = configs.get("label_len", 48)
+        self.out_len = configs.get("out_len", 24)
+        self.factor = configs.get("factor", 5)
+        self.d_model = configs.get("d_model", 512)
+        self.n_heads = configs.get("n_heads", 8)
+        self.e_layers = configs.get("e_layers", [3, 2, 1])
+        self.d_layers = configs.get("d_layers", 2)
+        self.d_ff = configs.get("d_ff", 512)
+        self.dropout = configs.get("dropout", 0.0)
+        self.attn = configs.get("attn", "prob")
+        self.embed = configs.get("embed", "fixed")
+        self.freq = configs.get("freq", "h")
+        self.activation = configs.get("activation", "gelu")
+        self.output_attention = configs.get("output_attention", False)
+        self.distil = configs.get("distil", True)
+        self.mix = configs.get("mix", True)
+        self.device = configs.get("device", torch.device('cuda:0'))
+
+        self.pred_len = self.out_len
+
+        # Embedding layers
+        self.enc_embedding = DataEmbedding(self.enc_in, self.d_model, self.embed, self.freq, self.dropout)
+        self.dec_embedding = DataEmbedding(self.dec_in, self.d_model, self.embed, self.freq, self.dropout)
+
+        # Attention type
+        Attn = ProbAttention if self.attn == 'prob' else FullAttention
+
+        # Encoder stack
+        inp_lens = list(range(len(self.e_layers)))  # [0,1,2,...] you can customize here
+        encoders = [
+            Encoder(
+                [
+                    EncoderLayer(
+                        AttentionLayer(Attn(False, self.factor, attention_dropout=self.dropout, output_attention=self.output_attention),
+                                       self.d_model, self.n_heads, mix=False),
+                        self.d_model,
+                        self.d_ff,
+                        dropout=self.dropout,
+                        activation=self.activation
+                    ) for l in range(el)
+                ],
+                [
+                    ConvLayer(
+                        self.d_model
+                    ) for l in range(el-1)
+                ] if self.distil else None,
+                norm_layer=torch.nn.LayerNorm(self.d_model)
+            ) for el in self.e_layers]
+        self.encoder = EncoderStack(encoders, inp_lens)
+
+        # Decoder
+        self.decoder = Decoder(
+            [
+                DecoderLayer(
+                    AttentionLayer(Attn(True, self.factor, attention_dropout=self.dropout, output_attention=False),
+                                   self.d_model, self.n_heads, mix=self.mix),
+                    AttentionLayer(FullAttention(False, self.factor, attention_dropout=self.dropout, output_attention=False),
+                                   self.d_model, self.n_heads, mix=False),
+                    self.d_model,
+                    self.d_ff,
+                    dropout=self.dropout,
+                    activation=self.activation,
+                )
+                for l in range(self.d_layers)
+            ],
+            norm_layer=torch.nn.LayerNorm(self.d_model)
+        )
+
+        # Output projection
+        self.projection = nn.Linear(self.d_model, self.c_out, bias=True)
+
+    def forward(self, x_enc, x_mark_enc=None, x_dec=None, x_mark_dec=None,
+                enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None):
+        # If x_dec / x_mark_dec are not provided, build them from x_enc and label_len
+        if x_dec is None:
+            x_dec = torch.cat([x_enc[:, -self.label_len:, :], torch.zeros_like(x_enc[:, :self.pred_len, :])], dim=1)
+        if x_mark_dec is None and x_mark_enc is not None:
+            x_mark_dec = torch.cat([x_mark_enc[:, -self.label_len:, :], torch.zeros_like(x_mark_enc[:, :self.pred_len, :])], dim=1)
+
+        # Encode
+        enc_out = self.enc_embedding(x_enc, x_mark_enc)
+        enc_out, attns = self.encoder(enc_out, attn_mask=enc_self_mask)
+
+        # Decode
+        dec_out = self.dec_embedding(x_dec, x_mark_dec)
+        dec_out = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask)
+        dec_out = self.projection(dec_out)
+
+        if self.output_attention:
+            return dec_out[:, -self.pred_len:, :], attns
+        else:
+            return dec_out[:, -self.pred_len:, :]  # [B, L, D]
diff --git a/model/Informer/model_config.json b/model/Informer/model_config.json
index 5c9c5ae..8e08e26 100644
--- a/model/Informer/model_config.json
+++ b/model/Informer/model_config.json
@@ -2,6 +2,11 @@
     {
         "name": "Informer",
         "module": "model.Informer.model",
-        "entry": "InformerEncoder"
+        "entry": "Informer"
+    },
+    {
+        "name": "InformerStack",
+        "module": "model.Informer.model",
+        "entry": "InformerStack"
     }
 ]
\ No newline at end of file
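The loader that consumes model_config.json is not shown in this diff; a hypothetical resolution step consistent with these fields would be:

    import importlib, json

    with open("model/Informer/model_config.json") as f:
        registry = json.load(f)

    entry = next(e for e in registry if e["name"] == "Informer")
    ModelCls = getattr(importlib.import_module(entry["module"]), entry["entry"])
    # ModelCls is model.Informer.model.Informer; construct it with a configs dict.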
diff --git a/train.py b/train.py
index 55662ba..8e2c6d3 100644
--- a/train.py
+++ b/train.py
@@ -12,9 +12,9 @@ def read_config(config_path):
         config = yaml.safe_load(file)
 
     # Global settings
-    device = "cuda:0"  # run on cuda:0
+    device = "cpu"  # compute device (CPU for this debug run)
     seed = 2023  # random seed
-    epochs = 100  # number of training epochs
+    epochs = 1  # number of training epochs
 
     # Copy into the config
     config["basic"]["device"] = device
@@ -104,9 +104,14 @@ if __name__ == "__main__":
     # model_list = ["iTransformer", "PatchTST", "HI"]
     model_list = ["Informer"]
     # model_list = ["PatchTST"]
-    # dataset_list = ["AirQuality"]
-    # dataset_list = ["SolarEnergy"]
-    # dataset_list = ["BJTaxi-InFlow", "BJTaxi-OutFlow"]
-    dataset_list = ["SolarEnergy", "NYCBike-InFlow", "NYCBike-OutFlow", "METR-LA"]
-    # dataset_list = ["BJTaxi-OutFlow"]
+
+    air = ["AirQuality"]
+    big_dataset = ["BJTaxi-InFlow", "BJTaxi-OutFlow"]
+    mid_dataset = ["PEMS-BAY"]
+    regular_dataset = ["AirQuality", "SolarEnergy", "NYCBike-InFlow", "NYCBike-OutFlow", "METR-LA"]
+    test_dataset = ["BJTaxi-InFlow"]
+
+    all_dataset = big_dataset + mid_dataset + regular_dataset
+
+    dataset_list = test_dataset
 
     main(model_list, dataset_list, debug=False)
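End-to-end, the refactored Informer can be smoke-tested as below (a minimal sketch: hyperparameters come from the YAMLs above, it runs on CPU to match the train.py setting, and it assumes model/Informer/decoder.py provides the standard Informer Decoder/DecoderLayer, which this diff imports but does not add):

    import torch
    from model.Informer.model import Informer

    configs = {"enc_in": 1, "dec_in": 1, "c_out": 1,
               "seq_len": 24, "label_len": 12, "out_len": 24,
               "d_model": 64, "d_ff": 512, "n_heads": 4,
               "e_layers": 1, "d_layers": 1, "dropout": 0.1,
               "attn": "prob", "embed": "fixed", "freq": "h",
               "activation": "gelu", "distil": True, "mix": True,
               "device": torch.device("cpu")}

    model = Informer(configs)
    x_enc = torch.randn(4, 24, 1)   # [B, seq_len, enc_in]
    y = model(x_enc)                # x_dec is built internally from x_enc
    print(y.shape)                  # torch.Size([4, 24, 1])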