123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193 |
- import os
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- from matplotlib.pyplot import MultipleLocator
- import math
# Bin widths used when partitioning the operating range (example values).
intervalPower = 25          # active-power bin width
intervalWindspeed = 0.25    # wind-speed bin width

# Column names in the turbine-information table.
fieldRatedPower = "额定功率"
fieldRatedWindSpeed = "额定风速"
fieldWindSpeedCutIn = "切入风速"
fieldWindSpeedCutOut = "切出风速"

# Column names in the SCADA table.
fieldTime = "时间"
fieldWindSpeed = "风速"
fieldActivePower = "变频器电网侧有功功率"

# Name of the label column created during preprocessing.
fieldLabel = "lab"
# 1. Data loading and preprocessing functions
def loadData(filePathSCADA: str, filePathTurbineInfo: str):
    """
    Read the SCADA table and the turbine-information table from CSV files.

    Parameters:
        filePathSCADA        path of the SCADA CSV file (read as UTF-8)
        filePathTurbineInfo  path of the turbine-information CSV file
    Returns:
        A tuple (SCADA DataFrame, turbine-information DataFrame).
    """
    return (
        pd.read_csv(filePathSCADA, encoding="utf-8"),
        pd.read_csv(filePathTurbineInfo),
    )
def extractTurbineParameters(turbineInfo: pd.DataFrame):
    """
    Pull the turbine design parameters out of the turbine-information table.

    Each parameter is returned as a two-dimensional array with one column
    (one row per row of ``turbineInfo``), exactly as stored in the table.

    Parameters:
        turbineInfo  turbine-information DataFrame
    Returns:
        A tuple, in this order:
        rated power, cut-out wind speed, cut-in wind speed, rated wind speed.
    """
    # Columns are read in this order; the return order below is different.
    lookupOrder = (
        fieldRatedPower,
        fieldWindSpeedCutIn,
        fieldWindSpeedCutOut,
        fieldRatedWindSpeed,
    )
    columnValues = {name: turbineInfo[[name]].to_numpy() for name in lookupOrder}
    return (
        columnValues[fieldRatedPower],
        columnValues[fieldWindSpeedCutOut],
        columnValues[fieldWindSpeedCutIn],
        columnValues[fieldRatedWindSpeed],
    )
def preprocessData(dataFrameOfSCADA: pd.DataFrame):
    """
    Build a working copy of the SCADA data holding only the needed columns.

    The time, active-power and wind-speed columns are selected through the
    module-level field-name constants (the same names the rest of the file
    uses) and an integer label column named ``fieldLabel`` is added, filled
    with zeros.

    Parameters:
        dataFrameOfSCADA  turbine SCADA DataFrame
    Returns:
        A new DataFrame with the time, active-power, wind-speed and label
        columns; the input DataFrame is not modified.
    """
    selectedFields = [fieldTime, fieldActivePower, fieldWindSpeed]
    # Explicit copy so that adding the label column never touches the input.
    dataFramePartOfSCADA = dataFrameOfSCADA.loc[:, selectedFields].copy()
    dataFramePartOfSCADA[fieldLabel] = np.zeros(len(dataFramePartOfSCADA), dtype=int)
    return dataFramePartOfSCADA
# 2. Data labelling and bin computation
def _asScalar(value, name):
    """Collapse a plain number or a single-element array into a Python float."""
    flat = np.asarray(value, dtype=float).ravel()
    if flat.size != 1:
        raise ValueError(
            "{} must hold exactly one value, got {}".format(name, flat.size)
        )
    return float(flat[0])


def calculateIntervals(activePowerMax, ratedPower, windSpeedCutOut,
                       powerInterval=None, windSpeedInterval=None):
    """
    Compute how many active-power and wind-speed bins cover the operating range.

    Parameters:
        activePowerMax     largest active power observed for the turbine
        ratedPower         rated power of the turbine
        windSpeedCutOut    cut-out wind speed
        powerInterval      active-power bin width; defaults to the module-level
                           ``intervalPower``
        windSpeedInterval  wind-speed bin width; defaults to the module-level
                           ``intervalWindspeed``
        The first three arguments may be plain numbers or single-element
        arrays (such as the 1x1 arrays produced from a one-row table).
    Returns:
        A tuple (number of active-power bins, number of wind-speed bins).
    Raises:
        ValueError  if an interval is not positive or an argument holds
                    anything other than exactly one value
    """
    if powerInterval is None:
        powerInterval = intervalPower
    if windSpeedInterval is None:
        windSpeedInterval = intervalWindspeed
    if powerInterval <= 0 or windSpeedInterval <= 0:
        raise ValueError("bin intervals must be positive")

    # Convert explicitly: handing an array to math.floor/math.ceil depends on
    # NumPy's deprecated implicit array-to-scalar conversion.
    powerMax = _asScalar(activePowerMax, "activePowerMax")
    rated = _asScalar(ratedPower, "ratedPower")
    cutOut = _asScalar(windSpeedCutOut, "windSpeedCutOut")

    if powerMax >= rated:
        # One extra bin so the observed maximum itself falls inside a bin.
        binNumOfPower = math.floor(powerMax / powerInterval) + 1
    else:
        binNumOfPower = math.floor(rated / powerInterval)
    binNumOfWindSpeed = math.ceil(cutOut / windSpeedInterval)
    return binNumOfPower, binNumOfWindSpeed
def labelData(dataFramePartOfSCADA: pd.DataFrame, conditions):
    """
    Assign labels to rows by comparing active power against thresholds.

    The 'label' column is reset to 0, then each recognised condition is
    applied in the iteration order of ``conditions``:
        'power_below'  rows with active power <= threshold get label -1
        'power_above'  rows with active power >= threshold get label 1
    Unrecognised condition names are ignored.

    Parameters:
        dataFramePartOfSCADA (DataFrame): data holding the active-power
            column; it is modified in place.
        conditions (dict): maps a condition name to its threshold.

    Returns:
        DataFrame: the same DataFrame object with the 'label' column set.
    """
    # NOTE(review): this writes to 'label', which differs from the
    # module-level fieldLabel ("lab") - confirm which column is intended.
    # Condition name -> (Series comparison method, label value to assign).
    rules = {
        'power_below': ('le', -1),
        'power_above': ('ge', 1),
    }

    dataFramePartOfSCADA['label'] = 0
    for name, limit in conditions.items():
        rule = rules.get(name)
        if rule is None:
            continue
        comparison, labelValue = rule
        rowMask = getattr(dataFramePartOfSCADA[fieldActivePower], comparison)(limit)
        dataFramePartOfSCADA.loc[rowMask, 'label'] = labelValue

    return dataFramePartOfSCADA
def computeBins(data, intervals):
    """Count how many values of each column fall into fixed-width bins.

    For every column named in ``intervals`` the range [min, max] of that
    column is split into bins of the given width, starting at the column
    minimum, and the number of values in each bin is counted.

    Parameters:
        data (DataFrame): the data to bin.
        intervals (dict): maps a column name to its bin width.

    Returns:
        DataFrame: one column of per-bin counts for each requested column.

    Raises:
        ValueError: if a bin width is not positive, or a column has no
            non-missing values to derive a range from.
    """
    # NOTE(review): every column gets its own bin edges, so the per-column
    # counts are combined on differing interval indexes - confirm that the
    # resulting table shape is what callers expect.
    binsResults = {}
    for column, interval in intervals.items():
        if interval <= 0:
            raise ValueError(
                "bin interval for {!r} must be positive".format(column)
            )
        series = data[column]
        minValue = series.min()
        maxValue = series.max()
        if pd.isna(minValue) or pd.isna(maxValue):
            raise ValueError(
                "column {!r} has no values to bin".format(column)
            )

        # Build edges from an integer bin count instead of a float-step
        # np.arange, whose length is unreliable for non-integer steps.
        # At least one bin is kept so a constant column is still countable.
        binCount = max(1, math.ceil((maxValue - minValue) / interval))
        edges = minValue + interval * np.arange(binCount + 1)
        if edges[-1] < maxValue:
            # Rounding left the last edge just short of the maximum.
            edges[-1] = maxValue

        binnedData = pd.cut(series, edges, include_lowest=True)
        # Series.value_counts replaces the deprecated top-level pd.value_counts.
        binsResults[column] = binnedData.value_counts(sort=False)

    return pd.DataFrame(binsResults)
# 3. Label application function
def applyLabels(data: pd.DataFrame, labels) -> pd.DataFrame:
    """Store externally supplied or precomputed labels in the 'label' column.

    Parameters:
        data (DataFrame): the data to label; it is modified in place.
        labels (Series or array): the labels to store; must match the
            index or the length of ``data``.

    Returns:
        DataFrame: the same DataFrame object, with 'label' set to ``labels``.
    """
    labelColumn = 'label'
    data[labelColumn] = labels
    return data
# 4. Data visualisation
def plot_data(ws: list, ap: list):
    """
    Show a scatter plot of active power against wind speed.

    Parameters:
        ws  wind-speed values (x axis)
        ap  active-power values (y axis)
    The axes are fixed to 0-30 on x and 0-2200 on y, with major ticks every
    5 and 500 respectively; the call ends by showing the figure.
    """
    plt.figure()
    plt.scatter(ws, ap, s=1, c='black', marker='.')

    axes = plt.gca()
    axes.xaxis.set_major_locator(MultipleLocator(5))
    axes.yaxis.set_major_locator(MultipleLocator(500))
    axes.set_xlim((0, 30))
    axes.set_ylim((0, 2200))
    axes.tick_params(labelsize=8)
    axes.set_xlabel("V/(m$·$s$^{-1}$)", fontsize=8)
    axes.set_ylabel("P/kW", fontsize=8)

    plt.show()
# 5. Main execution
def main():
    """
    Run the labelling pipeline for one turbine.

    Loads the SCADA and turbine-information CSV files, labels the rows by
    active-power thresholds, computes bin counts, writes the labelled data
    to a CSV file and shows the power-versus-wind-speed scatter plot.
    """
    turbine = 82
    filePathSCADA = r'E:\BaiduNetdiskDownload\test\min_scada_LuoTuoGou\72\{}.csv'.format(turbine)
    filePathTurbineInfo = r'E:\BaiduNetdiskDownload\test\min_scada_LuoTuoGou\72\info.csv'
    outputFilePathOfSCADA = r"E:\BaiduNetdiskDownload\test\min_scada_LuoTuoGou\72\labeled\labeled_{}.csv".format(turbine)

    dataFrameOfSCADA, turbineInfo = loadData(filePathSCADA, filePathTurbineInfo)
    ratedPower, windSpeedCutOut, windSpeedCutIn, ratedWindSpeed = extractTurbineParameters(turbineInfo)
    dataFramePartOfSCADA = preprocessData(dataFrameOfSCADA)
    powerMax = dataFramePartOfSCADA[fieldActivePower].max()

    # The turbine parameters arrive as 2-D arrays; use the first row's value
    # so the bin computation works on plain scalars.
    binNumOfPower, binNumOfWindSpeed = calculateIntervals(
        powerMax, ratedPower[0][0], windSpeedCutOut[0][0]
    )

    # Label the rows by active-power thresholds.
    conditions = {'power_below': 10, 'power_above': ratedPower[0][0]}
    labeledData = labelData(dataFramePartOfSCADA, conditions)

    # Compute bin counts for active power and wind speed.
    intervals = {fieldActivePower: 100, fieldWindSpeed: 1}
    binnedData = computeBins(labeledData, intervals)

    # Apply labels supplied from elsewhere (random placeholder values here).
    # NOTE(review): this overwrites the threshold labels assigned above with
    # random values - replace with real labels before relying on the output.
    externalLabels = np.random.choice([0, 1], size=len(labeledData))
    labeledData = applyLabels(labeledData, externalLabels)

    # Create the output folder first; to_csv does not create missing folders.
    outputDirectory = os.path.dirname(outputFilePathOfSCADA)
    if outputDirectory:
        os.makedirs(outputDirectory, exist_ok=True)
    labeledData.to_csv(outputFilePathOfSCADA)

    plot_data(labeledData[fieldWindSpeed], labeledData[fieldActivePower])


if __name__ == '__main__':
    main()
|