REPST #3

Merged
czzhangheng merged 42 commits from REPST into main 2025-12-20 16:03:22 +08:00
14 changed files with 85 additions and 159 deletions
Showing only changes of commit 5e52f23c8d - Show all commits

View File

@@ -13,7 +13,7 @@ def read_config(config_path):
# 全局配置 # 全局配置
device = "cuda:0" # 指定设备为cuda:0 device = "cuda:0" # 指定设备为cuda:0
seed = 2023 # 随机种子 seed = 2023 # 随机种子
epochs = 1 # 训练轮数 epochs = 100 # 训练轮数
# 拷贝项 # 拷贝项
config["basic"]["device"] = device config["basic"]["device"] = device
@@ -91,8 +91,8 @@ if __name__ == "__main__":
# 调试用 # 调试用
# model_list = ["iTransformer", "PatchTST", "HI"] # model_list = ["iTransformer", "PatchTST", "HI"]
# model_list = ["ASTRA_v2", "GWN", "REPST", "STAEFormer", "MTGNN"] # model_list = ["ASTRA_v2", "GWN", "REPST", "STAEFormer", "MTGNN"]
model_list = ["MTGNN"] model_list = ["iTransformer"]
# dataset_list = ["AirQuality", "SolarEnergy", "PEMS-BAY", "METR-LA", "BJTaxi-InFlow", "BJTaxi-OutFlow", "NYCBike-InFlow", "NYCBike-OutFlow"] dataset_list = ["AirQuality", "SolarEnergy", "PEMS-BAY", "METR-LA", "BJTaxi-InFlow", "BJTaxi-OutFlow", "NYCBike-InFlow", "NYCBike-OutFlow"]
dataset_list = ["AirQuality"] # dataset_list = ["AirQuality"]
# dataset_list = ["AirQuality", "SolarEnergy", "METR-LA", "NYCBike-InFlow", "NYCBike-OutFlow"] # dataset_list = ["AirQuality", "SolarEnergy", "METR-LA", "NYCBike-InFlow", "NYCBike-OutFlow"]
main(model_list, dataset_list, debug = False) main(model_list, dataset_list, debug = True)

View File

@@ -3,25 +3,13 @@ from tqdm import tqdm
from utils.logger import get_logger from utils.logger import get_logger
from utils.loss_function import all_metrics from utils.loss_function import all_metrics
class TSWrapper:
def __init__(self, args):
self.n = args['data']['num_nodes']
def forward(self, x):
# [b, t, n, c] -> [b*n, t, c]
b, t, n, c = x.shape
x = x[..., :-2].permute(0, 2, 1, 3).reshape(b * n, t, c-2)
return x, b, t, n, c
def inverse(self, x, b, t, n, c):
return x.reshape(b, n, t, c-2).permute(0, 2, 1, 3)
class Trainer: class Trainer:
def __init__(self, model, loss, optimizer, def __init__(self, model, loss, optimizer,
train_loader, val_loader, test_loader, train_loader, val_loader, test_loader,
scaler, args, lr_scheduler=None): scaler, args, lr_scheduler=None):
self.config = args
self.device = args["basic"]["device"] self.device = args["basic"]["device"]
self.args = args["train"] self.args = args["train"]
@@ -35,7 +23,24 @@ class Trainer:
self.test_loader = test_loader self.test_loader = test_loader
self.scaler = scaler self.scaler = scaler
self.ts = TSWrapper(args) # ---------- shape magic (replace TSWrapper) ----------
self.pack = lambda x: (
x[..., :-2]
.permute(0, 2, 1, 3)
.reshape(-1, x.size(1), x.size(3) - 2),
x.shape
)
self.unpack = lambda y, s: (
y.reshape(s[0], s[2], s[1], -1)
.permute(0, 2, 1, 3)
)
# ---------- inverse scaler ----------
self.inv = lambda x: torch.cat(
[s.inverse_transform(x[..., i:i+1]) for i, s in enumerate(self.scaler)],
dim=-1
)
self._init_paths() self._init_paths()
self._init_logger() self._init_logger()
@@ -51,7 +56,7 @@ class Trainer:
self.logger = get_logger( self.logger = get_logger(
self.args["log_dir"], self.args["log_dir"],
name=self.model.__class__.__name__, name=self.model.__class__.__name__,
debug=self.args["debug"], debug=self.args["debug"]
) )
# ---------------- epoch ---------------- # ---------------- epoch ----------------
@@ -67,21 +72,17 @@ class Trainer:
data, target = data.to(self.device), target.to(self.device) data, target = data.to(self.device), target.to(self.device)
label = target[..., :self.args["output_dim"]] label = target[..., :self.args["output_dim"]]
x, b, t, n, c = self.ts.forward(data) x, shp = self.pack(data)
out = self.model(x) out = self.unpack(self.model(x), shp)
out = self.ts.inverse(out, b, t, n, c)
if os.environ.get("TRY") == "True": if os.environ.get("TRY") == "True":
if out.shape == label.shape: print(f"out:{out.shape} label:{label.shape}",
print("shape true") "" if out.shape == label.shape else "")
assert False
else:
print("shape false")
assert False assert False
loss = self.loss(out, label) loss = self.loss(out, label)
d_out = self.scaler.inverse_transform(out)
d_lbl = self.scaler.inverse_transform(label) d_out, d_lbl = self.inv(out), self.inv(label)
d_loss = self.loss(d_out, d_lbl) d_loss = self.loss(d_out, d_lbl)
total_loss += d_loss.item() total_loss += d_loss.item()
@@ -98,9 +99,7 @@ class Trainer:
) )
self.optimizer.step() self.optimizer.step()
y_pred = torch.cat(y_pred) y_pred, y_true = torch.cat(y_pred), torch.cat(y_true)
y_true = torch.cat(y_true)
mae, rmse, mape = all_metrics( mae, rmse, mape = all_metrics(
y_pred, y_true, y_pred, y_true,
self.args["mae_thresh"], self.args["mae_thresh"],
@@ -110,23 +109,28 @@ class Trainer:
self.logger.info( self.logger.info(
f"Epoch #{epoch:02d} {mode:<5} " f"Epoch #{epoch:02d} {mode:<5} "
f"MAE:{mae:5.2f} RMSE:{rmse:5.2f} " f"MAE:{mae:5.2f} RMSE:{rmse:5.2f} "
f"MAPE:{mape:7.4f} Time:{time.time()-start:.2f}s" f"MAPE:{mape:7.4f} "
f"Time:{time.time() - start:.2f}s"
) )
return total_loss / len(loader) return total_loss / len(loader)
# ---------------- train ---------------- # ---------------- train ----------------
def train(self): def train(self):
best, best_test = float("inf"), float("inf") best = best_test = float("inf")
best_w, best_test_w = None, None best_w = best_test_w = None
patience = 0 patience = 0
self.logger.info("Training started") self.logger.info("Training started")
for epoch in range(1, self.args["epochs"] + 1): for epoch in range(1, self.args["epochs"] + 1):
losses = { losses = {
"train": self._run_epoch(epoch, self.train_loader, "train"), k: self._run_epoch(epoch, l, k)
"val": self._run_epoch(epoch, self.val_loader, "val"), for k, l in [
"test": self._run_epoch(epoch, self.test_loader, "test"), ("train", self.train_loader),
("val", self.val_loader),
("test", self.test_loader)
]
} }
if losses["train"] > 1e6: if losses["train"] > 1e6:
@@ -171,15 +175,14 @@ class Trainer:
data, target = data.to(self.device), target.to(self.device) data, target = data.to(self.device), target.to(self.device)
label = target[..., :self.args["output_dim"]] label = target[..., :self.args["output_dim"]]
x, b, t, n, c = self.ts.forward(data) x, shp = self.pack(data)
out = self.model(x) out = self.unpack(self.model(x), shp)
out = self.ts.inverse(out, b, t, n, c)
y_pred.append(out.cpu()) y_pred.append(out.cpu())
y_true.append(label.cpu()) y_true.append(label.cpu())
d_pred = self.scaler.inverse_transform(torch.cat(y_pred)) d_pred = self.inv(torch.cat(y_pred))
d_true = self.scaler.inverse_transform(torch.cat(y_true)) d_true = self.inv(torch.cat(y_true))
for t in range(d_true.shape[1]): for t in range(d_true.shape[1]):
mae, rmse, mape = all_metrics( mae, rmse, mape = all_metrics(
@@ -188,11 +191,15 @@ class Trainer:
self.args["mape_thresh"] self.args["mape_thresh"]
) )
self.logger.info( self.logger.info(
f"Horizon {t+1:02d} MAE:{mae:.4f} RMSE:{rmse:.4f} MAPE:{mape:.4f}" f"Horizon {t+1:02d} "
f"MAE:{mae:.4f} RMSE:{rmse:.4f} MAPE:{mape:.4f}"
) )
avg_mae, avg_rmse, avg_mape = all_metrics(d_pred, d_true, self.args["mae_thresh"], self.args["mape_thresh"]) mae, rmse, mape = all_metrics(
d_pred, d_true,
self.args["mae_thresh"],
self.args["mape_thresh"]
)
self.logger.info( self.logger.info(
f"AVG MAE:{avg_mae:.4f} AVG RMSE:{avg_rmse:.4f} AVG MAPE:{avg_mape:.4f}" f"AVG MAE:{mae:.4f} AVG RMSE:{rmse:.4f} AVG MAPE:{mape:.4f}"
) )

View File

@@ -3,59 +3,29 @@ from tqdm import tqdm
from utils.logger import get_logger from utils.logger import get_logger
from utils.loss_function import all_metrics from utils.loss_function import all_metrics
class Trainer: class Trainer:
def __init__(self, model, loss, optimizer, def __init__(self, model, loss, optimizer, train_loader, val_loader, test_loader, scaler, args, lr_scheduler=None):
train_loader, val_loader, test_loader, self.device, self.args = args["basic"]["device"], args["train"]
scaler, args, lr_scheduler=None): self.model, self.loss, self.optimizer, self.lr_scheduler = model.to(self.device), loss, optimizer, lr_scheduler
self.train_loader, self.val_loader, self.test_loader = train_loader, val_loader or test_loader, test_loader
self.config = args
self.device = args["basic"]["device"]
self.args = args["train"]
self.model = model.to(self.device)
self.loss = loss
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
self.train_loader = train_loader
self.val_loader = val_loader or test_loader
self.test_loader = test_loader
self.scaler = scaler self.scaler = scaler
self.inv = lambda x: torch.cat([s.inverse_transform(x[..., i:i+1]) for i, s in enumerate(self.scaler)], dim=-1) # 对每个维度调用反归一化器后cat
# ===== 新增:统一反归一化接口(单 scaler / 多 scaler 通吃)=====
self.inv = (
(lambda x: self.scaler.inverse_transform(x))
if not isinstance(self.scaler, (list, tuple))
else (lambda x: torch.cat(
[s.inverse_transform(x[..., i:i+1])
for i, s in enumerate(self.scaler)],
dim=-1))
)
self._init_paths() self._init_paths()
self._init_logger() self._init_logger()
# ---------------- init ---------------- # ---------------- init ----------------
def _init_paths(self): def _init_paths(self):
d = self.args["log_dir"] d = self.args["log_dir"]
self.best_path = os.path.join(d, "best_model.pth") self.best_path, self.best_test_path = os.path.join(d, "best_model.pth"), os.path.join(d, "best_test_model.pth")
self.best_test_path = os.path.join(d, "best_test_model.pth")
def _init_logger(self): def _init_logger(self):
if not self.args["debug"]: if not self.args["debug"]: os.makedirs(self.args["log_dir"], exist_ok=True)
os.makedirs(self.args["log_dir"], exist_ok=True) self.logger = get_logger(self.args["log_dir"], name=self.model.__class__.__name__, debug=self.args["debug"])
self.logger = get_logger(
self.args["log_dir"],
name=self.model.__class__.__name__,
debug=self.args["debug"],
)
# ---------------- epoch ---------------- # ---------------- epoch ----------------
def _run_epoch(self, epoch, loader, mode): def _run_epoch(self, epoch, loader, mode):
is_train = mode == "train" is_train = mode == "train"
self.model.train() if is_train else self.model.eval() self.model.train() if is_train else self.model.eval()
total_loss, start = 0.0, time.time() total_loss, start = 0.0, time.time()
y_pred, y_true = [], [] y_pred, y_true = [], []
@@ -63,20 +33,12 @@ class Trainer:
for data, target in tqdm(loader, desc=f"{mode} {epoch}", total=len(loader)): for data, target in tqdm(loader, desc=f"{mode} {epoch}", total=len(loader)):
data, target = data.to(self.device), target.to(self.device) data, target = data.to(self.device), target.to(self.device)
label = target[..., :self.args["output_dim"]] label = target[..., :self.args["output_dim"]]
out = self.model(data) out = self.model(data)
if os.environ.get("TRY") == "True": print(f"out: {out.shape}, label: {label.shape} \
if os.environ.get("TRY") == "True": {'' if out.shape == label.shape else ''}"); assert False
print(f"out: {out.shape}, label: {label.shape}")
assert False
loss = self.loss(out, label) loss = self.loss(out, label)
d_out, d_lbl = self.inv(out), self.inv(label) # 反归一化
# ===== 修改点:反归一化 =====
d_out = self.inv(out)
d_lbl = self.inv(label)
d_loss = self.loss(d_out, d_lbl) d_loss = self.loss(d_out, d_lbl)
total_loss += d_loss.item() total_loss += d_loss.item()
y_pred.append(d_out.detach().cpu()) y_pred.append(d_out.detach().cpu())
y_true.append(d_lbl.detach().cpu()) y_true.append(d_lbl.detach().cpu())
@@ -84,27 +46,12 @@ class Trainer:
if is_train and self.optimizer: if is_train and self.optimizer:
self.optimizer.zero_grad() self.optimizer.zero_grad()
loss.backward() loss.backward()
if self.args["grad_norm"]: if self.args["grad_norm"]: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args["max_grad_norm"])
torch.nn.utils.clip_grad_norm_(
self.model.parameters(),
self.args["max_grad_norm"]
)
self.optimizer.step() self.optimizer.step()
y_pred = torch.cat(y_pred) y_pred, y_true = torch.cat(y_pred), torch.cat(y_true)
y_true = torch.cat(y_true) mae, rmse, mape = all_metrics(y_pred, y_true, self.args["mae_thresh"], self.args["mape_thresh"])
self.logger.info(f"Epoch #{epoch:02d} {mode:<5} MAE:{mae:5.2f} RMSE:{rmse:5.2f} MAPE:{mape:7.4f} Time:{time.time()-start:.2f}s")
mae, rmse, mape = all_metrics(
y_pred, y_true,
self.args["mae_thresh"],
self.args["mape_thresh"]
)
self.logger.info(
f"Epoch #{epoch:02d} {mode:<5} "
f"MAE:{mae:5.2f} RMSE:{rmse:5.2f} "
f"MAPE:{mape:7.4f} Time:{time.time()-start:.2f}s"
)
return total_loss / len(loader) return total_loss / len(loader)
# ---------------- train ---------------- # ---------------- train ----------------
@@ -112,7 +59,6 @@ class Trainer:
best, best_test = float("inf"), float("inf") best, best_test = float("inf"), float("inf")
best_w, best_test_w = None, None best_w, best_test_w = None, None
patience = 0 patience = 0
self.logger.info("Training started") self.logger.info("Training started")
for epoch in range(1, self.args["epochs"] + 1): for epoch in range(1, self.args["epochs"] + 1):
@@ -122,27 +68,15 @@ class Trainer:
"test": self._run_epoch(epoch, self.test_loader, "test"), "test": self._run_epoch(epoch, self.test_loader, "test"),
} }
if losses["train"] > 1e6: if losses["train"] > 1e6: self.logger.warning("Gradient explosion detected"); break
self.logger.warning("Gradient explosion detected") if losses["val"] < best: best, patience, best_w = losses["val"], 0, copy.deepcopy(self.model.state_dict())
break else: patience += 1
if self.args["early_stop"] and patience == self.args["early_stop_patience"]: break
if losses["val"] < best: if losses["test"] < best_test: best_test, best_test_w = losses["test"], copy.deepcopy(self.model.state_dict())
best, patience = losses["val"], 0
best_w = copy.deepcopy(self.model.state_dict())
else:
patience += 1
if self.args["early_stop"] and patience == self.args["early_stop_patience"]:
break
if losses["test"] < best_test:
best_test = losses["test"]
best_test_w = copy.deepcopy(self.model.state_dict())
if not self.args["debug"]: if not self.args["debug"]:
torch.save(best_w, self.best_path) torch.save(best_w, self.best_path)
torch.save(best_test_w, self.best_test_path) torch.save(best_test_w, self.best_test_path)
self._final_test(best_w, best_test_w) self._final_test(best_w, best_test_w)
# ---------------- final test ---------------- # ---------------- final test ----------------
@@ -164,25 +98,10 @@ class Trainer:
y_pred.append(self.model(data).cpu()) y_pred.append(self.model(data).cpu())
y_true.append(label.cpu()) y_true.append(label.cpu())
# ===== 修改点:反归一化 ===== d_pred, d_true = self.inv(torch.cat(y_pred)), self.inv(torch.cat(y_true)) # 反归一化
d_pred = self.inv(torch.cat(y_pred))
d_true = self.inv(torch.cat(y_true))
for t in range(d_true.shape[1]): for t in range(d_true.shape[1]):
mae, rmse, mape = all_metrics( mae, rmse, mape = all_metrics(d_pred[:, t], d_true[:, t], self.args["mae_thresh"], self.args["mape_thresh"])
d_pred[:, t], d_true[:, t], self.logger.info(f"Horizon {t+1:02d} MAE:{mae:.4f} RMSE:{rmse:.4f} MAPE:{mape:.4f}")
self.args["mae_thresh"],
self.args["mape_thresh"]
)
self.logger.info(
f"Horizon {t+1:02d} MAE:{mae:.4f} RMSE:{rmse:.4f} MAPE:{mape:.4f}"
)
avg_mae, avg_rmse, avg_mape = all_metrics( avg_mae, avg_rmse, avg_mape = all_metrics(d_pred, d_true, self.args["mae_thresh"], self.args["mape_thresh"])
d_pred, d_true, self.logger.info(f"AVG MAE:{avg_mae:.4f} AVG RMSE:{avg_rmse:.4f} AVG MAPE:{avg_mape:.4f}")
self.args["mae_thresh"],
self.args["mape_thresh"]
)
self.logger.info(
f"AVG MAE:{avg_mae:.4f} AVG RMSE:{avg_rmse:.4f} AVG MAPE:{avg_mape:.4f}"
)