import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import MultipleLocator

# Bin widths used when gridding the power curve (example values).
intervalPower = 25  # active-power bin width
intervalWindspeed = 0.25  # wind-speed bin width

# Column names in the turbine-information table.
fieldRatedPower = "额定功率"
fieldRatedWindSpeed = "额定风速"
fieldWindSpeedCutIn = "切入风速"
fieldWindSpeedCutOut = "切出风速"

# Column names in the SCADA table.
fieldTime = "时间"
fieldWindSpeed = "风速"
fieldActivePower = "变频器电网侧有功功率"

# Name of the label column created by preprocessData.
fieldLabel = "lab"


# 1. Data loading and preprocessing
def loadData(filePathSCADA: str, filePathTurbineInfo: str):
    """Read the SCADA CSV and the turbine-information CSV.

    Args:
        filePathSCADA: Path of the SCADA CSV file (read as UTF-8).
        filePathTurbineInfo: Path of the turbine-information CSV file
            (read with the pandas default encoding).

    Returns:
        A ``(dataFrameSCADA, dataFrameTurbineInfo)`` tuple of DataFrames.
    """
    dataFrameSCADA = pd.read_csv(filePathSCADA, encoding="utf-8")
    dataFrameTurbineInfo = pd.read_csv(filePathTurbineInfo)
    return dataFrameSCADA, dataFrameTurbineInfo


def extractTurbineParameters(turbineInfo: pd.DataFrame):
    """Extract the turbine design parameters from the information table.

    Args:
        turbineInfo: Turbine-information DataFrame containing the
            rated-power, cut-in, cut-out and rated-wind-speed columns.

    Returns:
        A tuple ``(ratedPower, windSpeedCutOut, windSpeedCutIn,
        ratedWindSpeed)``. Each element is a 2-D NumPy array of shape
        ``(number_of_rows, 1)`` because every column is selected with a
        one-element list; use ``value[0][0]`` for the first row's scalar.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    ratedPower = turbineInfo.loc[:, [fieldRatedPower]].values
    windSpeedCutIn = turbineInfo.loc[:, [fieldWindSpeedCutIn]].values
    windSpeedCutOut = turbineInfo.loc[:, [fieldWindSpeedCutOut]].values
    ratedWindSpeed = turbineInfo.loc[:, [fieldRatedWindSpeed]].values
    # NOTE: cut-out is returned before cut-in; callers unpack in this order.
    return ratedPower, windSpeedCutOut, windSpeedCutIn, ratedWindSpeed


def preprocessData(dataFrameOfSCADA: pd.DataFrame):
    """Build a reduced SCADA frame with time, active power and wind speed.

    Args:
        dataFrameOfSCADA: Full SCADA DataFrame of one turbine.

    Returns:
        A new DataFrame holding the time, active-power and wind-speed
        columns (in that order) plus an integer label column named by
        ``fieldLabel`` that is initialised to 0. The input is not modified.

    Raises:
        KeyError: If one of the three source columns is missing.
    """
    # Use the shared field constants rather than repeating the literals.
    wantedColumns = [fieldTime, fieldActivePower, fieldWindSpeed]
    dataFramePartOfSCADA = dataFrameOfSCADA.loc[:, wantedColumns].copy()
    dataFramePartOfSCADA[fieldLabel] = 0
    dataFramePartOfSCADA[fieldLabel] = dataFramePartOfSCADA[fieldLabel].astype(int)
    return dataFramePartOfSCADA
# 2. Label assignment and bin computation
def _firstScalar(value):
    """Return the first element of a scalar or array-like as a float."""
    flat = np.asarray(value, dtype=float).ravel()
    if flat.size == 0:
        raise ValueError("expected at least one value, got an empty array")
    return float(flat[0])


def calculateIntervals(activePowerMax, ratedPower, windSpeedCutOut,
                       powerInterval=None, windSpeedInterval=None):
    """Compute how many power bins and wind-speed bins are needed.

    Args:
        activePowerMax: Largest active power observed for the turbine.
        ratedPower: Rated power of the turbine. May be a scalar or an
            array (such as the ``(n, 1)`` arrays produced by
            ``extractTurbineParameters``); the first element is used.
        windSpeedCutOut: Cut-out wind speed, scalar or array as above.
        powerInterval: Width of one power bin. Defaults to the
            module-level ``intervalPower``.
        windSpeedInterval: Width of one wind-speed bin. Defaults to the
            module-level ``intervalWindspeed``.

    Returns:
        ``(binNumOfPower, binNumOfWindSpeed)`` as two ints.

    Raises:
        ValueError: If an interval is not positive or an input is empty.
    """
    if powerInterval is None:
        powerInterval = intervalPower
    if windSpeedInterval is None:
        windSpeedInterval = intervalWindspeed
    if powerInterval <= 0 or windSpeedInterval <= 0:
        raise ValueError("bin intervals must be positive")

    # Reduce array inputs to plain floats: math.floor/ceil and the
    # comparison below are only well defined for scalars.
    activePowerMax = _firstScalar(activePowerMax)
    ratedPower = _firstScalar(ratedPower)
    windSpeedCutOut = _firstScalar(windSpeedCutOut)

    if activePowerMax >= ratedPower:
        # Observed power reaches or exceeds rated power: one extra bin so
        # the maximum itself is covered.
        binNumOfPower = math.floor(activePowerMax / powerInterval) + 1
    else:
        binNumOfPower = math.floor(ratedPower / powerInterval)
    binNumOfWindSpeed = math.ceil(windSpeedCutOut / windSpeedInterval)
    return binNumOfPower, binNumOfWindSpeed


def labelData(dataFramePartOfSCADA: pd.DataFrame, conditions, powerField=None):
    """Assign labels to rows from active-power thresholds.

    The frame is modified in place: its ``'label'`` column is reset to 0
    and then updated for each recognised condition, in the iteration order
    of ``conditions`` (a later condition overrides an earlier one on rows
    that match both).

    Args:
        dataFramePartOfSCADA: DataFrame containing the power column.
        conditions: Mapping of condition name to threshold. Recognised
            names are ``'power_below'`` (power <= threshold gives -1) and
            ``'power_above'`` (power >= threshold gives 1). Other names
            are ignored.
        powerField: Name of the power column. Defaults to the
            module-level ``fieldActivePower``.

    Returns:
        The same DataFrame object, with the ``'label'`` column set.
    """
    if powerField is None:
        powerField = fieldActivePower

    # NOTE(review): this column is 'label', whereas the module constant
    # fieldLabel is "lab"; confirm which name is intended before unifying.
    dataFramePartOfSCADA['label'] = 0
    power = dataFramePartOfSCADA[powerField]
    for condition, threshold in conditions.items():
        if condition == 'power_below':
            dataFramePartOfSCADA.loc[power <= threshold, 'label'] = -1
        elif condition == 'power_above':
            dataFramePartOfSCADA.loc[power >= threshold, 'label'] = 1
    return dataFramePartOfSCADA


def computeBins(data, intervals):
    """Count how many values of each column fall into fixed-width bins.

    Args:
        data: DataFrame holding the columns to bin.
        intervals: Mapping of column name to bin width.

    Returns:
        A DataFrame with one column of counts per binned column. The
        per-column count Series are combined by the DataFrame constructor,
        so rows are the bin intervals of all columns together.

    Raises:
        ValueError: If a bin width is not positive.
    """
    binsResults = {}
    for column, interval in intervals.items():
        if interval <= 0:
            raise ValueError("bin interval must be positive")
        minValue = data[column].min()
        maxValue = data[column].max()
        edges = np.arange(minValue, maxValue + interval, interval)
        # A constant column yields a single edge (no bin at all), and float
        # rounding in arange can leave the last edge just below the
        # maximum; add one more edge in both cases so every value is binned.
        if edges.size < 2 or edges[-1] < maxValue:
            edges = np.append(edges, edges[-1] + interval)
        binnedData = pd.cut(data[column], edges, include_lowest=True)
        # Series.value_counts replaces the deprecated top-level
        # pd.value_counts.
        binsResults[column] = binnedData.value_counts(sort=False)
    return pd.DataFrame(binsResults)


# 3. Applying labels
def applyLabels(data, labels):
    """Store externally supplied labels in the ``'label'`` column.

    The frame is modified in place; an existing ``'label'`` column is
    overwritten.

    Args:
        data: DataFrame to label.
        labels: Series or array-like of labels; it must be compatible
            with the frame's index or length.

    Returns:
        The same DataFrame object, with the ``'label'`` column set.
    """
    data['label'] = labels
    return data
# 4. Visualisation
def plot_data(ws: list, ap: list):
    """Show a scatter plot of active power against wind speed.

    Args:
        ws: Wind-speed values (x axis).
        ap: Active-power values (y axis), same length as ``ws``.

    The axes are fixed to 0-30 on x and 0-2200 on y, with major ticks
    every 5 and 500 respectively. The call blocks in ``plt.show()`` when
    an interactive backend is in use.
    """
    plt.figure()
    plt.scatter(ws, ap, s=1, c='black', marker='.')
    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(5))
    ax.yaxis.set_major_locator(MultipleLocator(500))
    plt.xlim((0, 30))
    plt.ylim((0, 2200))
    plt.tick_params(labelsize=8)
    plt.xlabel("V/(m$·$s$^{-1}$)", fontsize=8)
    plt.ylabel("P/kW", fontsize=8)
    plt.show()


# 5. Main execution
def main():
    """Run the labelling pipeline for one turbine and plot the result.

    Reads the SCADA and turbine-information CSV files, labels the rows,
    writes the labelled frame to a CSV file and shows the power curve
    scatter plot. Input and output paths are hard-coded below.
    """
    turbine = 82
    filePathSCADA = r'E:\BaiduNetdiskDownload\test\min_scada_LuoTuoGou\72\{}.csv'.format(turbine)
    filePathTurbineInfo = r'E:\BaiduNetdiskDownload\test\min_scada_LuoTuoGou\72\info.csv'
    outputFilePathOfSCADA = r"E:\BaiduNetdiskDownload\test\min_scada_LuoTuoGou\72\labeled\labeled_{}.csv".format(turbine)

    dataFrameOfSCADA, turbineInfo = loadData(filePathSCADA, filePathTurbineInfo)
    ratedPower, windSpeedCutOut, windSpeedCutIn, ratedWindSpeed = extractTurbineParameters(turbineInfo)
    dataFramePartOfSCADA = preprocessData(dataFrameOfSCADA)

    # The bin counts are computed but not consumed further in this script.
    powerMax = dataFramePartOfSCADA[fieldActivePower].max()
    binNumOfPower, binNumOfWindSpeed = calculateIntervals(powerMax, ratedPower, windSpeedCutOut)

    # Label the rows from power thresholds; the rated power of the first
    # row of the information table is the upper threshold.
    conditions = {'power_below': 10, 'power_above': ratedPower[0][0]}
    labeledData = labelData(dataFramePartOfSCADA, conditions)

    # Histogram of power and wind speed (result currently unused).
    intervals = {fieldActivePower: 100, fieldWindSpeed: 1}
    binnedData = computeBins(labeledData, intervals)

    # Apply labels assumed to come from elsewhere. The random values are a
    # placeholder and overwrite the threshold-based labels set above.
    externalLabels = np.random.choice([0, 1], size=len(labeledData))
    labeledData = applyLabels(labeledData, externalLabels)

    # to_csv does not create missing directories, so make sure the output
    # directory exists first.
    outputDirectory = os.path.dirname(outputFilePathOfSCADA)
    if outputDirectory:
        os.makedirs(outputDirectory, exist_ok=True)
    labeledData.to_csv(outputFilePathOfSCADA)

    plot_data(labeledData[fieldWindSpeed], labeledData[fieldActivePower])


if __name__ == '__main__':
    main()