REPST/scripts/report_generator.py

280 lines
11 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
报告生成器
用于生成数据集分析的详细报告
"""
import json
import numpy as np
from pathlib import Path
from datetime import datetime
from typing import Dict, List
import matplotlib.pyplot as plt
import seaborn as sns
# Configure a Chinese-capable font list for Matplotlib text, and disable the
# Unicode minus glyph for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})
class ReportGenerator:
    """Render dataset-analysis results as text reports, charts and JSON.

    ``analysis_results`` maps a dataset name to a result dict. The methods
    below read these sub-dicts and keys from every result:

    * ``basic_info``: ``domain``, ``shape``, ``frequency_minutes``
    * ``missing_analysis``: ``total_missing_rate``
    * ``temporal_analysis``: ``total_days``, ``total_hours``,
      ``data_density``, ``frequency_minutes``
    * ``spatial_analysis``: ``num_nodes``, ``num_features`` and, when a
      truthy ``num_edges`` is present, ``edge_density`` and ``avg_degree``
    """

    # Candidate style-sheet names, newest first. NOTE(review): the
    # 'seaborn-v0_8' name is understood to exist from Matplotlib 3.6 on and
    # 'seaborn' before that -- confirm against the supported versions.
    _STYLE_CANDIDATES = ('seaborn-v0_8', 'seaborn')

    def __init__(self, analysis_results: Dict):
        """Store the analysis results that every report is built from.

        Args:
            analysis_results: Mapping of dataset name to its analysis result.
        """
        self.analysis_results = analysis_results

    def generate_summary_report(self) -> str:
        """Build the summary report: an overview table plus per-dataset details.

        Returns:
            The report as one newline-joined string, or a fixed "no results"
            message when ``analysis_results`` is empty.
        """
        if not self.analysis_results:
            return "没有可用的分析结果"

        report = [
            "=" * 80,
            "BasicTS 数据集分析报告",
            "=" * 80,
            f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"分析数据集数量: {len(self.analysis_results)}",
            "",
            # Overview table header.
            "数据集概览:",
            "-" * 80,
            f"{'数据集名称':<15} {'领域':<20} {'时间步数':<10} {'节点数':<8} {'特征数':<8} {'频率(分钟)':<12} {'缺失值率(%)':<12}",
            "-" * 80,
        ]

        # One table row per dataset.
        for name, result in self.analysis_results.items():
            basic = result['basic_info']
            missing = result['missing_analysis']
            spatial = result['spatial_analysis']
            report.append(
                f"{name:<15} {basic['domain']:<20} {basic['shape'][0]:<10} "
                f"{spatial['num_nodes']:<8} {spatial['num_features']:<8} "
                f"{basic['frequency_minutes']:<12} {missing['total_missing_rate']:<12.3f}"
            )
        report.append("")

        # Detailed section per dataset.
        for name, result in self.analysis_results.items():
            basic = result['basic_info']
            missing = result['missing_analysis']
            temporal = result['temporal_analysis']
            spatial = result['spatial_analysis']

            report.append(f"数据集: {name}")
            report.append("-" * 40)
            report.append(f"领域: {basic['domain']}")
            report.append(f"数据形状: {basic['shape']}")
            report.append(f"时间频率: {basic['frequency_minutes']} 分钟")
            report.append(
                f"时间跨度: {temporal['total_days']:.1f} 天 ({temporal['total_hours']:.1f} 小时)"
            )
            report.append(f"节点数量: {spatial['num_nodes']}")
            report.append(f"特征数量: {spatial['num_features']}")
            # Graph metrics are only reported when an edge count is present.
            if spatial.get('num_edges'):
                report.append(f"边数量: {spatial['num_edges']}")
                report.append(f"边密度: {spatial['edge_density']:.4f}")
                report.append(f"平均度数: {spatial['avg_degree']:.2f}")
            report.append(f"缺失值率: {missing['total_missing_rate']:.3f}%")
            report.append(f"数据密度: {temporal['data_density']:.3f}")
            report.append("")

        return "\n".join(report)

    def generate_comparative_analysis(self) -> str:
        """Build the comparative report across datasets.

        The report has three sections: per-domain statistics, graph
        structure of datasets that have edges, and temporal coverage sorted
        by time span (longest first).

        Returns:
            The report as one newline-joined string, or a fixed "no results"
            message when ``analysis_results`` is empty.
        """
        if not self.analysis_results:
            return "没有可用的分析结果"

        report = ["=" * 80, "数据集对比分析", "=" * 80, ""]

        # Group results by domain, keeping first-seen domain order.
        domains = {}
        for name, result in self.analysis_results.items():
            domains.setdefault(result['basic_info']['domain'], []).append((name, result))

        for domain, datasets in domains.items():
            missing_rates = [res['missing_analysis']['total_missing_rate'] for _, res in datasets]
            node_counts = [res['spatial_analysis']['num_nodes'] for _, res in datasets]
            time_steps = [res['basic_info']['shape'][0] for _, res in datasets]

            report.append(f"领域: {domain}")
            report.append("-" * 40)
            report.append(f"数据集数量: {len(datasets)}")
            report.append(f"平均缺失值率: {np.mean(missing_rates):.3f}%")
            report.append(f"缺失值率范围: {min(missing_rates):.3f}% - {max(missing_rates):.3f}%")
            report.append(f"平均节点数: {np.mean(node_counts):.1f}")
            report.append(f"节点数范围: {min(node_counts)} - {max(node_counts)}")
            report.append(f"平均时间步数: {np.mean(time_steps):.0f}")
            report.append("")

        # Graph-structure section: only datasets with a truthy edge count.
        report.append("空间覆盖密度分析:")
        report.append("-" * 40)
        spatial_datasets = [
            (name, result)
            for name, result in self.analysis_results.items()
            if result['spatial_analysis'].get('num_edges')
        ]
        if spatial_datasets:
            for name, result in spatial_datasets:
                spatial = result['spatial_analysis']
                report.append(
                    f"{name}: {spatial['num_nodes']} 个节点, {spatial['num_edges']} 条边, "
                    f"密度 {spatial['edge_density']:.4f}, 平均度数 {spatial['avg_degree']:.2f}"
                )
        else:
            report.append("没有发现包含图结构的数据集")
        report.append("")

        # Temporal section, longest time span first.
        report.append("时间连续性分析:")
        report.append("-" * 40)
        temporal_data = [
            {
                'name': name,
                'days': result['temporal_analysis']['total_days'],
                'density': result['temporal_analysis']['data_density'],
                'frequency': result['temporal_analysis']['frequency_minutes'],
            }
            for name, result in self.analysis_results.items()
        ]
        temporal_data.sort(key=lambda entry: entry['days'], reverse=True)
        for entry in temporal_data:
            report.append(
                f"{entry['name']}: {entry['days']:.1f} 天, "
                f"数据密度 {entry['density']:.3f}, "
                f"频率 {entry['frequency']} 分钟"
            )

        return "\n".join(report)

    def create_visualizations(self, output_dir: str = "analysis_reports"):
        """Save four comparison charts as PNG files under ``output_dir``.

        Charts written: missing-rate bars, node-count bars, time-span bars
        and a node-count vs. missing-rate scatter plot. Prints a message and
        returns without writing anything when there are no results.

        Args:
            output_dir: Target directory; created (with parents) if missing.
        """
        if not self.analysis_results:
            print("没有可用的分析结果")
            return

        output_path = Path(output_dir)
        # parents=True so a nested output_dir does not fail on a missing parent.
        output_path.mkdir(parents=True, exist_ok=True)

        self._apply_plot_style()

        names = list(self.analysis_results.keys())

        # 1. Missing-rate comparison.
        missing_rates = [
            self.analysis_results[name]['missing_analysis']['total_missing_rate']
            for name in names
        ]
        self._save_bar_chart(
            output_path / "missing_rates_comparison.png", names, missing_rates,
            color='skyblue', title='各数据集缺失值率对比', ylabel='缺失值率 (%)',
            label_offset=0.1, label_format='{:.2f}%',
        )

        # 2. Node-count comparison.
        node_counts = [
            self.analysis_results[name]['spatial_analysis']['num_nodes']
            for name in names
        ]
        self._save_bar_chart(
            output_path / "node_counts_comparison.png", names, node_counts,
            color='lightgreen', title='各数据集节点数量对比', ylabel='节点数量',
            label_offset=max(node_counts) * 0.01, label_format='{}',
        )

        # 3. Time-span comparison.
        time_days = [
            self.analysis_results[name]['temporal_analysis']['total_days']
            for name in names
        ]
        self._save_bar_chart(
            output_path / "time_span_comparison.png", names, time_days,
            color='orange', title='各数据集时间跨度对比', ylabel='时间跨度 (天)',
            label_offset=max(time_days) * 0.01, label_format='{:.1f}',
        )

        # 4. Scatter plot: node count vs. missing rate, one labelled point per dataset.
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.scatter(node_counts, missing_rates, s=100, alpha=0.7, c='red')
            for name, x_value, y_value in zip(names, node_counts, missing_rates):
                ax.annotate(name, (x_value, y_value),
                            xytext=(5, 5), textcoords='offset points', fontsize=8)
            ax.set_xlabel('节点数量')
            ax.set_ylabel('缺失值率 (%)')
            ax.set_title('节点数量与缺失值率关系', fontsize=14, fontweight='bold')
            ax.grid(True, alpha=0.3)
            fig.tight_layout()
            fig.savefig(output_path / "nodes_vs_missing_rates.png", dpi=300, bbox_inches='tight')
        finally:
            # Close even if drawing or saving fails, so the figure is not leaked.
            plt.close(fig)

        print(f"可视化图表已保存到目录: {output_path}")

    def save_reports(self, output_dir: str = "analysis_reports"):
        """Write the summary, comparative and JSON reports into ``output_dir``.

        Files written: ``summary_report.txt``, ``comparative_analysis.txt``
        and ``detailed_analysis.json`` (all UTF-8).

        Args:
            output_dir: Target directory; created (with parents) if missing.
        """
        output_path = Path(output_dir)
        # parents=True so a nested output_dir does not fail on a missing parent.
        output_path.mkdir(parents=True, exist_ok=True)

        text_reports = (
            ("summary_report.txt", self.generate_summary_report()),
            ("comparative_analysis.txt", self.generate_comparative_analysis()),
        )
        for filename, content in text_reports:
            with open(output_path / filename, 'w', encoding='utf-8') as f:
                f.write(content)

        # Full results as JSON; NumPy values are converted to native types
        # instead of being stringified.
        with open(output_path / "detailed_analysis.json", 'w', encoding='utf-8') as f:
            json.dump(self.analysis_results, f, indent=2, ensure_ascii=False,
                      default=self._json_default)

        print(f"报告已保存到目录: {output_path}")

    @classmethod
    def _apply_plot_style(cls) -> None:
        """Apply the first available seaborn style sheet, then restore the fonts."""
        for style_name in cls._STYLE_CANDIDATES:
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break
        # NOTE(review): the seaborn style sheet is understood to set its own
        # 'font.sans-serif' list, which would drop the CJK font configured at
        # import time -- re-applying is harmless either way.
        plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
        plt.rcParams['axes.unicode_minus'] = False

    @staticmethod
    def _save_bar_chart(target: Path, names: List, values: List, *, color: str,
                        title: str, ylabel: str, label_offset: float,
                        label_format: str) -> None:
        """Draw one labelled bar per dataset and save the figure to ``target``."""
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            bars = ax.bar(names, values, color=color, alpha=0.7)
            ax.set_title(title, fontsize=14, fontweight='bold')
            ax.set_xlabel('数据集名称')
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', rotation=45)
            # Value label centred just above each bar.
            for bar, value in zip(bars, values):
                ax.text(bar.get_x() + bar.get_width() / 2,
                        bar.get_height() + label_offset,
                        label_format.format(value), ha='center', va='bottom')
            fig.tight_layout()
            fig.savefig(target, dpi=300, bbox_inches='tight')
        finally:
            # Close even if drawing or saving fails, so the figure is not leaked.
            plt.close(fig)

    @staticmethod
    def _json_default(value):
        """Convert values ``json`` cannot encode: NumPy types natively, the rest via ``str``."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        return str(value)