# @Time    : 2019-6-19
# @Author  : wilbur.cheng@SKF
# @FileName: baseMSET.py
# @Software: a package in the SKF anomaly detection library implementing the
#            Multivariate State Estimation Technique (MSET). This module
#            contains all the functions used in MSET: similarity calculation,
#            the state memory matrix D, the normal state matrix L, residual
#            calculation, and the basic SPRT (Sequential Probability Ratio
#            Test) function for binary hypothesis testing.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import BallTree
from sqlalchemy import create_engine  # used by _get_by_id
import time


class MSET_Temp():

    matrixD = None
    matrixL = None
    healthyResidual = None

    normalDataBallTree = None

    def __init__(self):
        self.model = None

    def _get_by_id(self, table_name, ids):
        """Fetch rows from the MySQL database by ID, one query per ID."""
        df_res = []
        engine = create_engine('mysql+pymysql://root:admin123456@106.120.102.238:10336/energy_data_prod')
        for id in ids:
            lastday_df_sql = f"SELECT * FROM {table_name} WHERE id = {id}"
            df = pd.read_sql(lastday_df_sql, engine)
            df_res.append(df)
        return df_res

    def calcSimilarity(self, x, y, m='euc'):
        """
        Calculate the similarity of two feature vectors.
        :param x: one input feature list
        :param y: one input feature list
        :param m: similarity calculation method; default is the Euclidean
                  distance, 'euc'. A city-block (Manhattan) distance is used
                  when m is set to 'cbd'.
        :return: similarity of the two feature vectors, float, range (0, 1]
        """
        if (len(x) != len(y)):
            return 0

        if (m == 'cbd'):
            dSimilarity = [1/(1+np.abs(p-q)) for p, q in zip(x, y)]
            dSimilarity = np.sum(dSimilarity)/len(x)
        else:
            dSimilarity = [np.power(p-q, 2) for p, q in zip(x, y)]
            dSimilarity = 1/(1+np.sqrt(np.sum(dSimilarity)))

        return dSimilarity
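    # Worked example (illustrative values, not from the original source):
    # identical vectors score 1.0 and the score decays toward 0 with distance.
    #   >>> m = MSET_Temp()
    #   >>> m.calcSimilarity([1.0, 2.0], [1.0, 2.0])    # identical -> 1.0
    #   >>> m.calcSimilarity([0.0], [3.0])              # euc: 1/(1+3) = 0.25
    #   >>> m.calcSimilarity([0.0], [3.0], m='cbd')     # cbd: 1/(1+3) = 0.25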
    def genDLMatrix(self, trainDataset, dataSize4D=100, dataSize4L=50):
        """
        Automatically generate the D and L matrices from the training data set,
        assuming the training data set contains only normal-state data.
        :param trainDataset: 2D array, [count of data, length of feature]
        :param dataSize4D: minimum count of states for matrix D
        :param dataSize4L: minimum count of states for matrix L
        :return: 0 if successful, -1 if failed
        """
        [m, n] = np.shape(trainDataset)
        if m < dataSize4D + dataSize4L:
            print('training dataset is too small to generate the matrices D and L')
            return -1

        self.matrixD = []
        selectIndex4D = []
        # Step 1: add every state carrying the minimum or maximum value of a
        # feature into matrix D
        for i in range(0, n):
            feature_i = trainDataset[:, i]
            minIndex = np.argmin(feature_i)
            maxIndex = np.argmax(feature_i)

            self.matrixD.append(trainDataset[minIndex, :].tolist())
            selectIndex4D.append(minIndex)
            self.matrixD.append(trainDataset[maxIndex, :].tolist())
            selectIndex4D.append(maxIndex)

        # Step 2: iteratively add the state with the maximum average distance
        # to the states already selected for matrix D
        while True:
            if (len(selectIndex4D) >= dataSize4D):
                break

            # Get the list of still-unselected (free) states
            freeStateList = list(set(np.arange(0, len(trainDataset))) - set(selectIndex4D))

            # Calculate the average distance of each free state to the states
            # already in matrix D
            distList = []
            for i in freeStateList:
                tmpState = trainDataset[i]
                tmpDist = [1-self.calcSimilarity(x, tmpState) for x in self.matrixD]
                distList.append(np.mean(tmpDist))

            # Select the free state with the largest average distance to the
            # states in matrix D, and update matrix D
            selectId = freeStateList[distList.index(np.max(distList))]
            self.matrixD.append(trainDataset[selectId, :].tolist())
            selectIndex4D.append(selectId)

        # Convert matrixD from list to array
        self.matrixD = np.array(self.matrixD)
        self.normalDataBallTree = BallTree(self.matrixD,
                                           leaf_size=4,
                                           metric=lambda i, j: 1-self.calcSimilarity(i, j))

        # Step 3: select the remaining states for matrix L
        #index4L = list(set(np.arange(0, len(trainDataset))) - set(selectIndex4D))
        #self.matrixL = trainDataset[index4L, :]
        # Considering the limited dataset, use all the training data as matrix L
        self.matrixL = trainDataset

        # Calculate the healthy residual from matrix L
        lamdaRatio = 1e-3
        [m, n] = np.shape(self.matrixD)
        # regularize the diagonal of the m-by-m similarity matrix so its
        # elementwise reciprocal stays finite
        self.DDSimilarity = np.array(
            [[1-self.calcSimilarity(x, y) for x in self.matrixD] for y in self.matrixD]) + lamdaRatio*np.eye(m)
        self.DDSimilarity = 1/self.DDSimilarity

        #self.healthyResidual = self.calcResidual(self.matrixL)
        self.healthyResidual = self.calcResidualByLocallyWeightedLR(self.matrixL)

        return 0
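    # Sizing sketch (assumed example, not from the original source): with
    # trainDataset of shape (200, 3) and dataSize4D=60, Step 1 seeds matrix D
    # with up to 2*3 = 6 min/max states, Step 2 grows it to 60 rows by maximum
    # average distance, and matrix L keeps all 200 training rows.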
    def calcResidualByLocallyWeightedLR(self, newStates):
        """
        Find the K-nearest neighbors of each input state in matrix D, then
        calculate the estimated state as their locally weighted average.
        :param newStates: input state list
        :return: residual R_x
        """
        [m, n] = np.shape(newStates)
        est_X = []
        k = min(20, len(self.matrixD))  # guard against a small matrix D
        for x in newStates:
            (dist, iList) = self.normalDataBallTree.query([x], k, return_distance=True)
            weight = 1/(dist[0]+1e-1)
            weight = weight/sum(weight)
            # weighted average of the neighbouring states in matrix D
            eState = np.sum([w*self.matrixD[i] for w, i in zip(weight, iList[0])], axis=0)
            est_X.append(eState)

        est_X = np.reshape(est_X, np.shape(newStates))
        return est_X - newStates

    def calcStateResidual(self, newStates):
        stateResidual = self.calcResidualByLocallyWeightedLR(newStates)
        return stateResidual

    def calcSPRT(self, newStates, feature_weight, alpha=0.1, beta=0.1, decisionGroup=5):
        """
        Anomaly detection based on Wald's SPRT algorithm; refer to
        A. Wald, Sequential Analysis, Wiley, New York, NY, USA, 1947.
        :param newStates: input state list
        :param feature_weight: importance weight of each feature, normalized and non-negative
        :param alpha: prescribed false alarm rate, 0 < alpha < 1
        :param beta: prescribed missed alarm rate, 0 < beta < 1
        :param decisionGroup: length of the sample window over which each decision is made
        :return: anomaly flag for each group of states; 1: anomaly, -1: normal,
                 (-1, 1): unable to decide
        """
        # Step 1: reduce each raw residual vector to a scalar using the
        # feature weights
        stateResidual = self.calcResidualByLocallyWeightedLR(newStates)
        weightedStateResidual = [np.dot(x, feature_weight) for x in stateResidual]
        weightedHealthyResidual = [np.dot(x, feature_weight) for x in self.healthyResidual]

        # Step 2: estimate the distribution of the healthy-state residual
        mu0 = np.mean(weightedHealthyResidual)
        sigma0 = np.std(weightedHealthyResidual)

        lowThres = np.log(beta/(1-alpha))
        highThres = np.log((1-beta)/alpha)

        flag = []
        for i in range(0, len(newStates)-decisionGroup+1):
            # For each group, estimate the residual distribution of the new
            # states, then evaluate the hypothesis test
            mu1 = np.mean(weightedStateResidual[i:i+decisionGroup])
            si = np.sum(weightedStateResidual[i:i+decisionGroup])*(mu1-mu0)/sigma0**2 \
                 - decisionGroup*(mu1**2-mu0**2)/(2*sigma0**2)

            # clip the log-likelihood ratio into [lowThres, highThres], then
            # rescale it to [-1, 1] while preserving its sign so that -1 means
            # normal and +1 means anomaly
            if (si > highThres):
                si = highThres
            if (si < lowThres):
                si = lowThres

            if (si > 0):
                si = si/highThres
            elif (si < 0):
                si = si/np.abs(lowThres)

            flag.append(si)

        return flag
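# Minimal usage sketch on synthetic data (illustrative only: the function name,
# data, sizes, and seed are assumptions, not part of the original pipeline).
# A healthy one-feature temperature signal is used for training; the test
# signal drifts upward halfway through, which should push the SPRT flag
# toward +1 in the second half.
def _demo_synthetic():
    rng = np.random.default_rng(0)
    train = rng.normal(50.0, 1.0, size=(200, 1))             # healthy data
    test = np.vstack([rng.normal(50.0, 1.0, size=(50, 1)),   # still healthy
                      rng.normal(55.0, 1.0, size=(50, 1))])  # drifted
    model = MSET_Temp()
    if model.genDLMatrix(train, dataSize4D=60, dataSize4L=5) == 0:
        flags = model.calcSPRT(test, np.array(1), decisionGroup=5)
        print('first flags:', np.round(flags[:3], 3))   # expected near -1
        print('last flags: ', np.round(flags[-3:], 3))  # expected near +1
# _demo_synthetic()  # uncomment to run the sketch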
if __name__ == '__main__':

    start_time = time.time()
    myMSET = MSET_Temp()
    # 1. Calculate the health score of each subsystem (subsystems: generator,
    #    drive train (direct-drive units have no gearbox, no data), nacelle,
    #    converter, grid environment, auxiliary systems (no data))
    # 1.1 Generator health score
    # Title = pd.read_excel(r'/Users/xmia/Documents/code/Temp_Diag/34_QITAIHE.xlsx', header=None, nrows=1, usecols=[12, 13, 14])
    # df_D = pd.read_excel(r'/Users/xmia/Documents/code/Temp_Diag/34_QITAIHE.xlsx',
    #                      usecols=[0, 12, 13, 14], parse_dates=True)
    # Read the temperature indicators: gearbox oil temperature (col 12),
    # drive-end generator bearing temperature (col 13), non-drive-end
    # generator bearing temperature (col 14)
    df = pd.read_csv('/Users/xmia/Desktop/ZN/华电中光/471_QTH1125/2024/#34.csv',
                     usecols=[
                         'wind_turbine_number',
                         'time_stamp',
                         'main_bearing_temperature',
                         'gearbox_oil_temperature',
                         'generatordrive_end_bearing_temperature',
                         'generatornon_drive_end_bearing_temperature'
                     ],
                     parse_dates=['time_stamp'])
    df = df[df['wind_turbine_number'] == 'WOG01312']
    df = df.drop(columns=['wind_turbine_number'])
    df_D = df[df['time_stamp'] > '2024-11-15']
    cols = df_D.columns
    df_D[cols] = df_D[cols].apply(pd.to_numeric, errors='coerce')
    df_D = df_D.dropna()  # drop empty rows and non-numeric entries
    x_date = df_D.iloc[len(df_D) // 2 + 1:, 0]  # time stamps of the test half
    x_date = pd.to_datetime(x_date)
    df_Temp = df_D.iloc[:, 1:]  # all columns except the time stamp
    df_Temp_values = df_Temp.values
    df_Temp_values = np.array(df_Temp_values)
    [m, n] = np.shape(df_Temp_values)

    flag_Spart_data = []
    # Run MSET + SPRT per temperature channel: train on the first half of each
    # series, test on the second half
    for i in range(0, n):
        df_Temp_i = df_Temp_values[:, i]
        trainDataSet_data = df_Temp_i[0:len(df_Temp_i) // 2]
        testDataSet_data = df_Temp_i[len(df_Temp_i) // 2 + 1:]
        trainDataSet_data = np.reshape(trainDataSet_data, (len(trainDataSet_data), 1))
        testDataSet_data = np.reshape(testDataSet_data, (len(testDataSet_data), 1))

        myMSET.genDLMatrix(trainDataSet_data, 60, 5)
        # stateResidual = myMSET.calcStateResidual(testDataSet_data)
        flag_data = myMSET.calcSPRT(testDataSet_data, np.array(1), decisionGroup=1)
        flag_Spart_data.append(flag_data)

    flag_Spart_data = np.array(flag_Spart_data)
    flag_Spart_data = flag_Spart_data.T
    Temp1 = flag_Spart_data[:, 0]
    Temp2 = flag_Spart_data[:, 1]
    Temp3 = flag_Spart_data[:, 2]
    Temp1_label = "gearbox_oil_temperature"
    Temp2_label = "generatordrive_end_bearing_temperature"
    Temp3_label = "generatornon_drive_end_bearing_temperature"
    print(x_date)
    # alarmedFlag = np.array([[i, Temp1[i]] for i, x in enumerate(Temp1) if x > 0.99])  # points with flag > 0.99

    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK text correctly
    plt.close('all')
    plt.subplot(311)
    plt.plot(x_date, Temp1, 'b-o', label=Temp1_label)
    plt.ylabel(Temp1_label)
    plt.xlabel('Time')
    plt.subplot(312)
    plt.plot(x_date, Temp2)
    plt.ylabel(Temp2_label)
    plt.xlabel('Time')
    plt.subplot(313)
    plt.plot(x_date, Temp3)
    plt.ylabel(Temp3_label)
    plt.xlabel('Time')
    plt.show()
    print(flag_Spart_data)
    print(np.shape(flag_Spart_data))
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time} seconds")

    # --- Legacy vibration-data example (kept disabled) ---
    '''
    f = open("speed_vib.txt", "r")
    data1 = f.read()
    f.close()
    data1 = data1.split('\n')
    rpm = [(row.split('\t')[0]).strip() for row in data1]
    vib = [(row.split('\t')[-1]).strip() for row in data1]
    rpm = np.array(rpm).astype(np.float64)
    vib = np.array(vib).astype(np.float64)
    #vib = [(x-np.mean(vib))/np.std(vib) for x in vib]
    trainDataSet = [vib[i] for i in range(0, 100) if vib[i] < 5]
    trainDataSet = np.reshape(trainDataSet, (len(trainDataSet), 1))
    testDataSet = np.reshape(vib, (len(vib), 1))
    # Title = pd.read_csv(r'F1710001001.csv', header=None, nrows=1, usecols=[36,37,38], encoding='gbk')

    alarmedFlag = np.array([[i, flag[i]] for i, x in enumerate(flag) if x > 0.99])  # points with flag > 0.99
    plt.close('all')
    plt.subplot(211)
    plt.plot(testDataSet)
    plt.ylabel('Vibration')
    plt.xlabel('Sample index')
    plt.subplot(212)
    plt.plot(flag)
    plt.ylabel('SPRT results')
    plt.xlabel('Sample index')
    plt.scatter(alarmedFlag[:, 0], alarmedFlag[:, 1], marker='x', c='r')
    plt.show()
    '''
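# Worked threshold example (values follow directly from calcSPRT's formulas,
# assuming the defaults alpha = beta = 0.1):
#   lowThres  = ln(0.1 / 0.9) ~= -2.197
#   highThres = ln(0.9 / 0.1) ~= +2.197
# The log-likelihood ratio is clipped into [lowThres, highThres] and rescaled
# to [-1, 1], so a flag near +1 accepts the anomaly hypothesis, a flag near -1
# accepts the healthy one, and the 0.99 alarm cut used above keeps only
# near-certain anomalies.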