#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Report generator.

Builds detailed text, JSON and chart reports from dataset analysis results.
"""

import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List

import numpy as np

# The plotting stack is only needed by ``create_visualizations``; the text and
# JSON reports must stay usable on machines where it is not installed.
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None

try:
    import seaborn as sns
except ImportError:
    sns = None

# Font configuration for the Chinese chart labels (SimHei first, DejaVu Sans
# as fallback).  Kept in one place because it has to be applied twice: at
# import time and again after a style sheet has been loaded.
_FONT_RC = {
    'font.sans-serif': ['SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
}

# Style sheet names to try, in order of preference.  'seaborn-v0_8' is the
# name used by matplotlib >= 3.6; older releases ship it as 'seaborn'.
_STYLE_CANDIDATES = ('seaborn-v0_8', 'seaborn')

if plt is not None:
    plt.rcParams.update(_FONT_RC)


class ReportGenerator:
    """Render dataset analysis results as text reports, JSON and charts.

    ``analysis_results`` maps a dataset name to a result dict.  The methods
    of this class read the following entries from each result:

    * ``basic_info``: ``domain``, ``shape``, ``frequency_minutes``
    * ``missing_analysis``: ``total_missing_rate``
    * ``temporal_analysis``: ``total_days``, ``total_hours``,
      ``data_density``, ``frequency_minutes``
    * ``spatial_analysis``: ``num_nodes``, ``num_features`` and, optionally,
      ``num_edges`` together with ``edge_density`` and ``avg_degree``
    """

    def __init__(self, analysis_results: Dict):
        """
        Initialise the report generator.

        Args:
            analysis_results: Dataset analysis results keyed by dataset name.
        """
        self.analysis_results = analysis_results

    def generate_summary_report(self) -> str:
        """Build the summary report.

        The report consists of a header, a one-row-per-dataset overview table
        and a detailed section for every dataset.

        Returns:
            The report as a single newline-joined string, or a short
            placeholder message when there are no analysis results.

        Raises:
            KeyError: If a result lacks one of the entries listed in the
                class docstring.
        """
        if not self.analysis_results:
            return "没有可用的分析结果"

        lines: List[str] = [
            "=" * 80,
            "BasicTS 数据集分析报告",
            "=" * 80,
            f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"分析数据集数量: {len(self.analysis_results)}",
            "",
            # Overview table header.
            "数据集概览:",
            "-" * 80,
            f"{'数据集名称':<15} {'领域':<20} {'时间步数':<10} {'节点数':<8} "
            f"{'特征数':<8} {'频率(分钟)':<12} {'缺失值率(%)':<12}",
            "-" * 80,
        ]

        # Overview table body: one row per dataset.
        for name, result in self.analysis_results.items():
            basic = result['basic_info']
            missing = result['missing_analysis']
            spatial = result['spatial_analysis']
            lines.append(
                f"{name:<15} {basic['domain']:<20} {basic['shape'][0]:<10} "
                f"{spatial['num_nodes']:<8} {spatial['num_features']:<8} "
                f"{basic['frequency_minutes']:<12} "
                f"{missing['total_missing_rate']:<12.3f}"
            )
        lines.append("")

        # Detailed per-dataset sections.
        for name, result in self.analysis_results.items():
            basic = result['basic_info']
            missing = result['missing_analysis']
            temporal = result['temporal_analysis']
            spatial = result['spatial_analysis']

            lines.extend([
                f"数据集: {name}",
                "-" * 40,
                f"领域: {basic['domain']}",
                f"数据形状: {basic['shape']}",
                f"时间频率: {basic['frequency_minutes']} 分钟",
                f"时间跨度: {temporal['total_days']:.1f} 天 "
                f"({temporal['total_hours']:.1f} 小时)",
                f"节点数量: {spatial['num_nodes']}",
                f"特征数量: {spatial['num_features']}",
            ])

            # Graph statistics are only printed when an edge count is present
            # and non-zero.
            if spatial.get('num_edges'):
                lines.extend([
                    f"边数量: {spatial['num_edges']}",
                    f"边密度: {spatial['edge_density']:.4f}",
                    f"平均度数: {spatial['avg_degree']:.2f}",
                ])

            lines.extend([
                f"缺失值率: {missing['total_missing_rate']:.3f}%",
                f"数据密度: {temporal['data_density']:.3f}",
                "",
            ])

        return "\n".join(lines)

    def generate_comparative_analysis(self) -> str:
        """Build the comparative analysis report.

        The report has three parts: per-domain statistics, a listing of the
        datasets that carry graph structure, and a listing of all datasets
        ordered by time span (longest first).

        Returns:
            The report as a single newline-joined string, or a short
            placeholder message when there are no analysis results.

        Raises:
            KeyError: If a result lacks one of the entries listed in the
                class docstring.
        """
        if not self.analysis_results:
            return "没有可用的分析结果"

        lines: List[str] = [
            "=" * 80,
            "数据集对比分析",
            "=" * 80,
            "",
        ]

        # Group results by domain, keeping first-seen domain order.
        by_domain: Dict[str, List[Dict]] = {}
        for result in self.analysis_results.values():
            by_domain.setdefault(result['basic_info']['domain'], []).append(result)

        for domain, results in by_domain.items():
            missing_rates = [r['missing_analysis']['total_missing_rate'] for r in results]
            node_counts = [r['spatial_analysis']['num_nodes'] for r in results]
            time_steps = [r['basic_info']['shape'][0] for r in results]

            lines.extend([
                f"领域: {domain}",
                "-" * 40,
                f"数据集数量: {len(results)}",
                f"平均缺失值率: {np.mean(missing_rates):.3f}%",
                f"缺失值率范围: {min(missing_rates):.3f}% - {max(missing_rates):.3f}%",
                f"平均节点数: {np.mean(node_counts):.1f}",
                f"节点数范围: {min(node_counts)} - {max(node_counts)}",
                f"平均时间步数: {np.mean(time_steps):.0f}",
                "",
            ])

        # Spatial coverage: only datasets with a non-zero edge count.
        lines.extend(["空间覆盖密度分析:", "-" * 40])
        graph_rows = [
            f"{name}: {result['spatial_analysis']['num_nodes']} 个节点, "
            f"{result['spatial_analysis']['num_edges']} 条边, "
            f"密度 {result['spatial_analysis']['edge_density']:.4f}, "
            f"平均度数 {result['spatial_analysis']['avg_degree']:.2f}"
            for name, result in self.analysis_results.items()
            if result['spatial_analysis'].get('num_edges')
        ]
        if graph_rows:
            lines.extend(graph_rows)
        else:
            lines.append("没有发现包含图结构的数据集")
        lines.append("")

        # Temporal continuity: every dataset, longest time span first.
        lines.extend(["时间连续性分析:", "-" * 40])
        temporal_rows = [
            {
                'name': name,
                'days': result['temporal_analysis']['total_days'],
                'density': result['temporal_analysis']['data_density'],
                'frequency': result['temporal_analysis']['frequency_minutes'],
            }
            for name, result in self.analysis_results.items()
        ]
        temporal_rows.sort(key=lambda row: row['days'], reverse=True)
        lines.extend(
            f"{row['name']}: {row['days']:.1f} 天, "
            f"数据密度 {row['density']:.3f}, "
            f"频率 {row['frequency']} 分钟"
            for row in temporal_rows
        )

        return "\n".join(lines)

    @staticmethod
    def _apply_plot_style() -> None:
        """Load the first available seaborn style sheet, then restore fonts."""
        for style in _STYLE_CANDIDATES:
            if style in plt.style.available:
                plt.style.use(style)
                break
        # A style sheet may carry its own font settings, which would replace
        # the module-level configuration; apply it again so the Chinese
        # labels keep their font.
        plt.rcParams.update(_FONT_RC)

    @staticmethod
    def _save_bar_chart(target: Path, names: List[str], values: List,
                        *, color: str, title: str, ylabel: str,
                        label_offset: float, label_format: str) -> None:
        """Draw one labelled bar per dataset and save the figure to *target*.

        ``label_format`` is a ``str.format`` template applied to each value;
        ``label_offset`` is the vertical gap between a bar and its label.
        """
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            bars = ax.bar(names, values, color=color, alpha=0.7)
            ax.set_title(title, fontsize=14, fontweight='bold')
            ax.set_xlabel('数据集名称')
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', rotation=45)

            # Value label centred above every bar.
            for bar, value in zip(bars, values):
                ax.text(bar.get_x() + bar.get_width() / 2,
                        bar.get_height() + label_offset,
                        label_format.format(value),
                        ha='center', va='bottom')

            fig.tight_layout()
            fig.savefig(target, dpi=300, bbox_inches='tight')
        finally:
            # Release the figure even when drawing or saving fails.
            plt.close(fig)

    def create_visualizations(self, output_dir: str = "analysis_reports"):
        """Create the comparison charts and save them as PNG files.

        Writes ``missing_rates_comparison.png``, ``node_counts_comparison.png``,
        ``time_span_comparison.png`` and ``nodes_vs_missing_rates.png`` into
        *output_dir*, creating the directory (and its parents) if needed.
        Prints a message and returns without doing anything when there are no
        analysis results.

        Args:
            output_dir: Directory that receives the PNG files.

        Raises:
            ImportError: If matplotlib is not installed.
            KeyError: If a result lacks one of the entries listed in the
                class docstring.
        """
        if not self.analysis_results:
            print("没有可用的分析结果")
            return
        if plt is None:
            raise ImportError("创建可视化图表需要安装 matplotlib")

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        self._apply_plot_style()

        names = list(self.analysis_results)
        results = [self.analysis_results[name] for name in names]
        missing_rates = [r['missing_analysis']['total_missing_rate'] for r in results]
        node_counts = [r['spatial_analysis']['num_nodes'] for r in results]
        time_days = [r['temporal_analysis']['total_days'] for r in results]

        # 1. Missing-value rate per dataset.
        self._save_bar_chart(
            output_path / "missing_rates_comparison.png", names, missing_rates,
            color='skyblue', title='各数据集缺失值率对比', ylabel='缺失值率 (%)',
            label_offset=0.1, label_format='{:.2f}%',
        )

        # 2. Node count per dataset; label gap is 1% of the tallest bar.
        self._save_bar_chart(
            output_path / "node_counts_comparison.png", names, node_counts,
            color='lightgreen', title='各数据集节点数量对比', ylabel='节点数量',
            label_offset=max(node_counts) * 0.01, label_format='{}',
        )

        # 3. Time span per dataset; label gap is 1% of the tallest bar.
        self._save_bar_chart(
            output_path / "time_span_comparison.png", names, time_days,
            color='orange', title='各数据集时间跨度对比', ylabel='时间跨度 (天)',
            label_offset=max(time_days) * 0.01, label_format='{:.1f}',
        )

        # 4. Scatter plot: node count vs. missing-value rate.
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.scatter(node_counts, missing_rates, s=100, alpha=0.7, c='red')

            # Annotate every point with its dataset name.
            for name, nodes, rate in zip(names, node_counts, missing_rates):
                ax.annotate(name, (nodes, rate), xytext=(5, 5),
                            textcoords='offset points', fontsize=8)

            ax.set_xlabel('节点数量')
            ax.set_ylabel('缺失值率 (%)')
            ax.set_title('节点数量与缺失值率关系', fontsize=14, fontweight='bold')
            ax.grid(True, alpha=0.3)

            fig.tight_layout()
            fig.savefig(output_path / "nodes_vs_missing_rates.png",
                        dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

        print(f"可视化图表已保存到目录: {output_path}")

    @staticmethod
    def _json_default(obj):
        """Convert values ``json`` cannot serialise on its own.

        NumPy scalars become native Python scalars and arrays become lists,
        so numbers are written as JSON numbers rather than strings.  Anything
        else falls back to ``str``.
        """
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return str(obj)

    def save_reports(self, output_dir: str = "analysis_reports"):
        """Write the text reports and the raw results to *output_dir*.

        Creates the directory (and its parents) if needed and writes
        ``summary_report.txt``, ``comparative_analysis.txt`` and
        ``detailed_analysis.json``, all encoded as UTF-8.

        Args:
            output_dir: Directory that receives the report files.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Summary report.
        (output_path / "summary_report.txt").write_text(
            self.generate_summary_report(), encoding='utf-8')

        # Comparative analysis report.
        (output_path / "comparative_analysis.txt").write_text(
            self.generate_comparative_analysis(), encoding='utf-8')

        # Full analysis results as JSON.
        with open(output_path / "detailed_analysis.json", 'w', encoding='utf-8') as f:
            json.dump(self.analysis_results, f, indent=2, ensure_ascii=False,
                      default=self._json_default)

        print(f"报告已保存到目录: {output_path}")