wind_power.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
  1. """
  2. Module 1: 风速-功率异常检测
  3. 优化点:
  4. - PowerCurveDetector 改为逐点输出:训练时保存 bin 级正常范围 (mean±σ),
  5. predict 时将每个点映射到所属 bin,计算 z-score 作为逐点异常分数
  6. - ScatterDetector 增加环境温度特征(如果 ambient_temp 列存在),
  7. 因为空气密度影响功率输出
  8. - 训练/预测时过滤有效发电区间 (WIND_VALID_MIN ~ WIND_VALID_MAX)
  9. - 分箱样本量权重过滤,样本不足的箱不参与训练
  10. 子检测器:
  11. A. PowerCurveDetector - 基于分箱统计的功率曲线异常(z-score 逐点)
  12. B. ScatterDetector - 基于原始散点的风速-功率异常(IsolationForest)
  13. """
  14. import numpy as np
  15. import pandas as pd
  16. from sklearn.ensemble import IsolationForest
  17. from sklearn.preprocessing import StandardScaler
  18. import joblib
  19. from pathlib import Path
  20. from config import (
  21. COL_WIND_SPD, COL_P_ACTIVE, COL_AMBIENT_TEMP,
  22. WIND_BIN_WIDTH, WIND_BIN_MIN, WIND_BIN_MAX,
  23. ISO_CONTAMINATION, ISO_RANDOM_STATE, ISO_N_ESTIMATORS,
  24. WIND_VALID_MIN, WIND_VALID_MAX,
  25. )
  26. # 分箱最小样本量,样本不足的箱不参与训练
  27. _MIN_BIN_COUNT = 30
  28. # 功率偏离阈值(标准差倍数),按风速区间分段
  29. # 低风速段波动大,阈值放宽;高风速段接近额定,阈值收紧
  30. _SIGMA_LOW = 4.0 # < 8 m/s
  31. _SIGMA_MID = 3.0 # 8~16 m/s
  32. _SIGMA_HIGH = 2.5 # > 16 m/s
  33. def _get_sigma(wind_spd: float) -> float:
  34. if wind_spd < 8.0:
  35. return _SIGMA_LOW
  36. elif wind_spd <= 16.0:
  37. return _SIGMA_MID
  38. else:
  39. return _SIGMA_HIGH
  40. def _bin_wind_speed(series: pd.Series) -> pd.Series:
  41. bins = np.arange(WIND_BIN_MIN, WIND_BIN_MAX + WIND_BIN_WIDTH, WIND_BIN_WIDTH)
  42. return pd.cut(series, bins=bins, labels=False)
  43. def _filter_valid_wind(df: pd.DataFrame) -> pd.DataFrame:
  44. """过滤有效发电风速区间,排除切入/切出段噪声。"""
  45. mask = (df[COL_WIND_SPD] >= WIND_VALID_MIN) & (df[COL_WIND_SPD] <= WIND_VALID_MAX)
  46. return df[mask]
  47. # ── A. 功率曲线检测器(逐点输出) ──────────────────────────────────────────────
  48. class PowerCurveDetector:
  49. """
  50. 训练: 按风速分箱统计每个 bin 的 (mean, std),保存为正常范围。
  51. 预测: 将每个数据点映射到所属 bin,计算功率偏离度 z-score,
  52. 按风速区间分段 sigma 判异常(低风放宽/高风收紧)。
  53. 输出与原始数据等长。
  54. """
  55. def __init__(self):
  56. self.bin_stats: pd.DataFrame = pd.DataFrame()
  57. def fit(self, df: pd.DataFrame) -> "PowerCurveDetector":
  58. d = _filter_valid_wind(df[[COL_WIND_SPD, COL_P_ACTIVE]].copy())
  59. d["wind_bin"] = _bin_wind_speed(d[COL_WIND_SPD])
  60. stats = (
  61. d.groupby("wind_bin")[COL_P_ACTIVE]
  62. .agg(mean_power="mean", std_power="std", count="count")
  63. .reset_index()
  64. .dropna()
  65. )
  66. stats = stats[stats["count"] >= _MIN_BIN_COUNT]
  67. # std 为 0 时用全局 std 兜底
  68. global_std = d[COL_P_ACTIVE].std()
  69. stats["std_power"] = stats["std_power"].replace(0, global_std)
  70. if len(stats) < 3:
  71. raise ValueError("功率曲线有效分箱不足")
  72. self.bin_stats = stats
  73. return self
  74. def predict(self, df: pd.DataFrame) -> pd.DataFrame:
  75. # 保留原始索引,最终结果与输入等长
  76. out = pd.DataFrame({"anomaly": False, "score": np.nan}, index=df.index)
  77. valid_mask = (
  78. df[COL_WIND_SPD].notna() & df[COL_P_ACTIVE].notna() &
  79. (df[COL_WIND_SPD] >= WIND_VALID_MIN) & (df[COL_WIND_SPD] <= WIND_VALID_MAX)
  80. )
  81. d = df.loc[valid_mask, [COL_WIND_SPD, COL_P_ACTIVE]].copy()
  82. if d.empty:
  83. return out
  84. d["wind_bin"] = _bin_wind_speed(d[COL_WIND_SPD])
  85. bin_map = self.bin_stats.set_index("wind_bin")[["mean_power", "std_power"]]
  86. d["mean_power"] = d["wind_bin"].map(bin_map["mean_power"])
  87. d["std_power"] = d["wind_bin"].map(bin_map["std_power"])
  88. has_stat = d["mean_power"].notna() & d["std_power"].notna()
  89. d_stat = d.loc[has_stat].copy()
  90. z = (d_stat[COL_P_ACTIVE] - d_stat["mean_power"]) / d_stat["std_power"]
  91. sigma_arr = d_stat[COL_WIND_SPD].map(_get_sigma)
  92. out.loc[d_stat.index, "score"] = z.values
  93. out.loc[d_stat.index, "anomaly"] = (z.abs() > sigma_arr).values
  94. return out
  95. def save(self, path: Path):
  96. joblib.dump(self, path)
  97. @classmethod
  98. def load(cls, path: Path) -> "PowerCurveDetector":
  99. return joblib.load(path)
  100. # ── B. 散点检测器 ─────────────────────────────────────────────────────────────
  101. class ScatterDetector:
  102. """
  103. 对有效发电区间内的 (wind_spd, p_active) 点对做异常检测。
  104. 增加 p_active / wind_spd^3 特征(近似 Cp),捕捉风能利用率偏离。
  105. 可选: 如果数据中包含 ambient_temp 列,加入温度特征(空气密度影响功率)。
  106. """
  107. def __init__(self, contamination=ISO_CONTAMINATION):
  108. self.contamination = contamination
  109. self.scaler = StandardScaler()
  110. self.model = IsolationForest(
  111. n_estimators=ISO_N_ESTIMATORS,
  112. contamination=contamination,
  113. random_state=ISO_RANDOM_STATE,
  114. )
  115. self._has_temp = False
  116. def _features(self, df: pd.DataFrame) -> pd.DataFrame:
  117. cols = [COL_WIND_SPD, COL_P_ACTIVE]
  118. has_temp = COL_AMBIENT_TEMP in df.columns
  119. if has_temp:
  120. cols.append(COL_AMBIENT_TEMP)
  121. d = _filter_valid_wind(df[cols].copy()).dropna()
  122. wind3 = d[COL_WIND_SPD] ** 3
  123. d["cp_proxy"] = d[COL_P_ACTIVE] / wind3.replace(0, np.nan)
  124. if has_temp:
  125. # 温度越低空气密度越大,同风速下功率应更高
  126. d["temp"] = d[COL_AMBIENT_TEMP]
  127. return d.dropna()
  128. def fit(self, df: pd.DataFrame) -> "ScatterDetector":
  129. feat = self._features(df)
  130. if feat.empty:
  131. raise ValueError("散点特征为空,检查风速功率数据")
  132. self._has_temp = COL_AMBIENT_TEMP in feat.columns
  133. X = self.scaler.fit_transform(feat.select_dtypes(include=[np.number]))
  134. self.model.fit(X)
  135. return self
  136. def predict(self, df: pd.DataFrame) -> pd.DataFrame:
  137. out = pd.DataFrame({"anomaly": False, "score": np.nan}, index=df.index)
  138. feat = self._features(df)
  139. if feat.empty:
  140. return out
  141. X = self.scaler.transform(feat.select_dtypes(include=[np.number]))
  142. out.loc[feat.index, "anomaly"] = self.model.predict(X) == -1
  143. out.loc[feat.index, "score"] = self.model.score_samples(X)
  144. return out
  145. def save(self, path: Path):
  146. joblib.dump(self, path)
  147. @classmethod
  148. def load(cls, path: Path) -> "ScatterDetector":
  149. return joblib.load(path)