# Temp_Diag.PY

# @Time    : 2019-6-19
# @Author  : wilbur.cheng@SKF
# @FileName: baseMSET.py
# @Software: a package in the SKF anomaly detection library implementing the
# multivariate state estimation technique (MSET). This module includes all the
# functions used in MSET: similarity calculation, the state memory matrix D,
# the normal state matrix L, residual calculation, and a basic SPRT (Sequential
# Probability Ratio Test) function for binary hypothesis testing.
#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import BallTree
from sqlalchemy import create_engine  # used by _get_by_id below
import openpyxl
import time
class MSET_Temp():

    matrixD = None
    matrixL = None
    healthyResidual = None
    normalDataBallTree = None

    def __init__(self):
        self.model = None

    def _get_by_id(self, table_name, ids):
        """Get data from the MySQL database by IDs, one DataFrame per id."""
        df_res = []
        engine = create_engine('mysql+pymysql://root:admin123456@106.120.102.238:10336/energy_data_prod')
        for id in ids:
            lastday_df_sql = f"SELECT * FROM {table_name} where id = {id}"
            df = pd.read_sql(lastday_df_sql, engine)
            df_res.append(df)
        return df_res
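
    # Hypothetical usage sketch for _get_by_id (it is not exercised anywhere in
    # this file; the table name and ids below are made up for illustration):
    #   frames = MSET_Temp()._get_by_id('WOG01312_minute', [1, 2, 3])
    #   minute_df = pd.concat(frames, ignore_index=True)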
    def calcSimilarity(self, x, y, m='euc'):
        """
        Calculate the similarity of two feature vectors.
        :param x: one input feature list
        :param y: the other input feature list
        :param m: similarity calculation method; default is the Euclidean
                  distance, named 'euc'; a city block (Manhattan) distance
                  is used when m is set to 'cbd'.
        :return: the similarity of the two feature vectors, float, range (0, 1]
        """
        if (len(x) != len(y)):
            return 0
        if (m == 'cbd'):
            dSimilarity = [1/(1+np.abs(p-q)) for p, q in zip(x, y)]
            dSimilarity = np.sum(dSimilarity)/len(x)
        else:
            dSimilarity = [np.power(p-q, 2) for p, q in zip(x, y)]
            dSimilarity = 1/(1+np.sqrt(np.sum(dSimilarity)))
        return dSimilarity
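
    # Worked example (a sanity check, not part of the pipeline): for x = [0, 0]
    # and y = [3, 4] the Euclidean distance is 5, so the 'euc' similarity is
    # 1/(1+5) ≈ 0.167; the 'cbd' similarity averages 1/(1+3) and 1/(1+4),
    # giving (0.25 + 0.2)/2 = 0.225. Identical vectors score exactly 1.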
    def genDLMatrix(self, trainDataset, dataSize4D=100, dataSize4L=50):
        """
        Automatically generate the D and L matrices from the training data set,
        assuming the training data set contains only normal-state data.
        :param trainDataset: 2D array, [count of data, length of feature]
        :param dataSize4D: minimum count of states for matrix D
        :param dataSize4L: minimum count of states for matrix L
        :return: 0 if successful, -1 on failure
        """
        [m, n] = np.shape(trainDataset)
        if m < dataSize4D + dataSize4L:
            print('training dataset is too small to generate the matrix D and L')
            return -1

        self.matrixD = []
        selectIndex4D = []
        # Step 1: add every state holding the minimum or maximum value of each feature to matrix D
        for i in range(0, n):
            feature_i = trainDataset[:, i]
            minIndex = np.argmin(feature_i)
            maxIndex = np.argmax(feature_i)
            self.matrixD.append(trainDataset[minIndex, :].tolist())
            selectIndex4D.append(minIndex)
            self.matrixD.append(trainDataset[maxIndex, :].tolist())
            selectIndex4D.append(maxIndex)
        # Step 2: iteratively add the state with the maximum average distance to the states already in matrix D
        while len(selectIndex4D) < dataSize4D:
            # Get the list of states not yet selected
            freeStateList = list(set(np.arange(0, len(trainDataset))) - set(selectIndex4D))
            # Calculate the average distance of each free state to the selected states in matrix D
            distList = []
            for i in freeStateList:
                tmpState = trainDataset[i]
                tmpDist = [1-self.calcSimilarity(x, tmpState) for x in self.matrixD]
                distList.append(np.mean(tmpDist))
            # Select the free state with the largest average distance and update matrix D
            selectId = freeStateList[distList.index(np.max(distList))]
            self.matrixD.append(trainDataset[selectId, :].tolist())
            selectIndex4D.append(selectId)
        # Convert matrixD from list to array
        self.matrixD = np.array(self.matrixD)
        self.normalDataBallTree = BallTree(self.matrixD, leaf_size=4,
                                           metric=lambda i, j: 1-self.calcSimilarity(i, j))

        # Step 3: select the remaining states for matrix L
        # index4L = list(set(np.arange(0, len(trainDataset))) - set(selectIndex4D))
        # self.matrixL = trainDataset[index4L, :]
        # Given the limited dataset, use all the training data for matrix L
        self.matrixL = trainDataset

        # Calculate the healthy residual from matrix L
        lamdaRatio = 1e-3
        [m, n] = np.shape(self.matrixD)
        # The D-to-D similarity matrix is m x m, so the ridge term must be eye(m), not eye(n)
        self.DDSimilarity = np.array([[1-self.calcSimilarity(x, y) for x in self.matrixD]
                                      for y in self.matrixD]) + lamdaRatio*np.eye(m)
        self.DDSimilarity = 1/self.DDSimilarity

        # self.healthyResidual = self.calcResidual(self.matrixL)
        self.healthyResidual = self.calcResidualByLocallyWeightedLR(self.matrixL)
        return 0
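
    # Minimal sketch of building the memory matrices, assuming a toy 1-feature
    # normal-state history (the names below are illustrative only):
    #   train = np.random.normal(20.0, 1.0, (200, 1))   # 200 normal samples
    #   mset = MSET_Temp()
    #   mset.genDLMatrix(train, dataSize4D=60, dataSize4L=5)  # returns 0 on success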
    def calcResidualByLocallyWeightedLR(self, newStates):
        """
        Find the K nearest neighbours of each input state, then calculate the
        estimated state as their locally weighted average.
        :param newStates: input states list
        :return: residual R_x (estimate minus observation)
        """
        [m, n] = np.shape(newStates)
        est_X = []
        for x in newStates:
            (dist, iList) = self.normalDataBallTree.query([x], 20, return_distance=True)
            weight = 1/(dist[0]+1e-1)
            weight = weight/sum(weight)
            # Sum along axis 0 so each feature is averaged separately
            eState = np.sum([w*self.matrixD[i] for w, i in zip(weight, iList[0])], axis=0)
            est_X.append(eState)
        est_X = np.reshape(est_X, (len(est_X), n))
        return est_X - newStates
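
    # Weighting sketch: a neighbour at distance d_k gets raw weight 1/(d_k + 0.1),
    # normalized over the 20 neighbours. E.g. two neighbours at distances 0.1 and
    # 0.4 would get raw weights 5.0 and 2.0, i.e. normalized weights 5/7 and 2/7;
    # the estimate is the weighted sum of the matching matrixD rows, and the
    # returned residual is estimate minus observation.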
    def calcStateResidual(self, newsStates):
        stateResidual = self.calcResidualByLocallyWeightedLR(newsStates)
        return stateResidual
    def calcSPRT(self, newsStates, feature_weight, alpha=0.1, beta=0.1, decisionGroup=5):
        """
        Anomaly detection based on Wald's SPRT algorithm; see A. Wald,
        Sequential Analysis, Wiley, New York, NY, USA, 1947.
        :param newsStates: input states list
        :param feature_weight: the importance weight of each feature, normalized and nonnegative
        :param alpha: prescribed false alarm rate, 0 < alpha < 1
        :param beta: prescribed missed alarm rate, 0 < beta < 1
        :param decisionGroup: length of the test sample used for each decision
        :return: anomaly flag for each group of states; 1: anomaly, -1: normal,
                 values in (-1, 1): unable to decide
        """
        # Step 1: reduce each raw residual vector to a scalar using the feature weights
        # stateResidual = self.calcResidual(newsStates)
        stateResidual = self.calcResidualByLocallyWeightedLR(newsStates)
        weightedStateResidual = [np.dot(x, feature_weight) for x in stateResidual]
        weightedHealthyResidual = [np.dot(x, feature_weight) for x in self.healthyResidual]
        '''
        plt.subplot(211)
        plt.plot(weightedHealthyResidual)
        plt.xlabel('Sample index')
        plt.ylabel('Healthy State Residual')
        plt.subplot(212)
        plt.plot(weightedStateResidual)
        plt.xlabel('Sample index')
        plt.ylabel('All State Residual')
        plt.show()
        '''
        # Step 2: estimate the distribution of the healthy-state residual
        mu0 = np.mean(weightedHealthyResidual)
        sigma0 = np.std(weightedHealthyResidual)

        lowThres = np.log(beta/(1-alpha))
        highThres = np.log((1-beta)/alpha)

        flag = []
        for i in range(0, len(newsStates)-decisionGroup+1):
            # For each group, calculate the new-state residual distribution,
            # then check the hypothesis-testing result
            mu1 = np.mean(weightedStateResidual[i:i+decisionGroup])
            si = np.sum(weightedStateResidual[i:i+decisionGroup])*(mu1-mu0)/sigma0**2 \
                 - decisionGroup*(mu1**2-mu0**2)/(2*sigma0**2)

            # Clamp the statistic to the decision thresholds, then rescale it
            # into [-1, 1] so that +1 means anomaly and -1 means normal
            if (si > highThres):
                si = highThres
            if (si < lowThres):
                si = lowThres
            if (si > 0):
                si = si/highThres
            if (si < 0):
                si = -si/lowThres  # lowThres < 0, so this keeps the sign negative
            flag.append(si)
        return flag
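
    # The statistic si above is the Gaussian mean-shift SPRT log-likelihood ratio
    # (healthy N(mu0, sigma0^2) vs. current group N(mu1, sigma0^2)):
    #   si = sum(r) * (mu1 - mu0) / sigma0^2 - N * (mu1^2 - mu0^2) / (2 * sigma0^2)
    # with N = decisionGroup, and acceptance thresholds ln(beta/(1-alpha)) and
    # ln((1-beta)/alpha); the final clamping and division rescale si into [-1, 1].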
if __name__ == '__main__':

    start_time = time.time()
    myMSET = MSET_Temp()
    # 1. Calculate the health score of each subsystem (subsystems: generator set,
    #    drive train (direct-drive units have no gearbox, no data), nacelle system,
    #    converter system, grid environment, auxiliary system (no data))
    # 1.1 Generator set health score
    # Title = pd.read_excel(r'/Users/xmia/Documents/code/Temp_Diag/34_QITAIHE.xlsx', header=None, nrows=1, usecols=[12, 13, 14])
    # df_D = pd.read_excel(r'/Users/xmia/Documents/code/Temp_Diag/34_QITAIHE.xlsx',
    #                      usecols=[0, 12, 13, 14], parse_dates=True)  # read temperature channels: gearbox oil temperature (12), drive-end generator bearing temperature (13), non-drive-end generator bearing temperature (14)
    df = pd.read_csv('/Users/xmia/Desktop/ZN/华电中光/471_QTH1125/2024/#34.csv', usecols=[
        'wind_turbine_number',
        'time_stamp',
        'main_bearing_temperature',
        'gearbox_oil_temperature',
        'generatordrive_end_bearing_temperature',
        'generatornon_drive_end_bearing_temperature'
    ], parse_dates=['time_stamp'])
    df = df[df['wind_turbine_number'] == 'WOG01312']
    df = df.drop(columns=['wind_turbine_number'])
    df_D = df[df['time_stamp'] > '2024-11-15']
    cols = df_D.columns
    df_D[cols] = df_D[cols].apply(pd.to_numeric, errors='coerce')
    df_D = df_D.dropna()  # drop empty rows and non-numeric entries

    x_date = df_D.iloc[len(df_D) // 2 + 1:, 0]  # take the time stamps of the test half
    x_date = pd.to_datetime(x_date)
    df_Temp = df_D.iloc[:, 1:]  # take all columns except the time column
    df_Temp_values = df_Temp.values
    df_Temp_values = np.array(df_Temp_values)
    [m, n] = np.shape(df_Temp_values)

    flag_Spart_data = []
    for i in range(0, n):
        df_Temp_i = df_Temp_values[:, i]
        trainDataSet_data = df_Temp_i[0:len(df_Temp_i) // 2]
        testDataSet_data = df_Temp_i[len(df_Temp_i) // 2 + 1:]
        trainDataSet_data = np.reshape(trainDataSet_data, (len(trainDataSet_data), 1))
        testDataSet_data = np.reshape(testDataSet_data, (len(testDataSet_data), 1))

        myMSET.genDLMatrix(trainDataSet_data, 60, 5)
        # stateResidual = myMSET.calcStateResidual(testDataSet_data)
        flag_data = myMSET.calcSPRT(testDataSet_data, np.array([1]), decisionGroup=1)
        flag_Spart_data.append(flag_data)

    flag_Spart_data = np.array(flag_Spart_data)
    flag_Spart_data = flag_Spart_data.T
    Temp1 = flag_Spart_data[:, 0]
    Temp2 = flag_Spart_data[:, 1]
    Temp3 = flag_Spart_data[:, 2]
    Temp1_label = "gearbox_oil_temperature"
    Temp2_label = "generatordrive_end_bearing_temperature"
    Temp3_label = "generatornon_drive_end_bearing_temperature"
    print(x_date)

    # alarmedFlag = np.array([[i, Temp1[i]] for i, x in enumerate(Temp1) if x > 0.99])  # select points where the flag exceeds 0.99
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
    plt.close('all')
    plt.subplot(311)
    plt.plot(x_date, Temp1, 'b-o', label=Temp1_label)
    plt.ylabel(Temp1_label)
    plt.xlabel('Time')
    # plt.scatter(alarmedFlag1[:, 0], alarmedFlag1[:, 2], marker='x', c='r')
    plt.subplot(312)
    plt.plot(x_date, Temp2)
    plt.ylabel(Temp2_label)
    plt.xlabel('Time')
    # plt.scatter(alarmedFlag[:, 0], alarmedFlag[:, 1], marker='x', c='r')
    plt.subplot(313)
    plt.plot(x_date, Temp3)
    plt.ylabel(Temp3_label)
    plt.xlabel('Time')
    # plt.scatter(alarmedFlag[:, 0], alarmedFlag[:, 1], marker='x', c='r')
    plt.show()

    print(flag_Spart_data)
    print(np.shape(flag_Spart_data))
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time} seconds")
#
'''
f = open("speed_vib.txt", "r")
data1 = f.read()
f.close()
data1 = data1.split('\n')
rpm = [(row.split('\t')[0]).strip() for row in data1]
vib = [(row.split('\t')[-1]).strip() for row in data1]
# print(rpm)
rpm = np.array(rpm).astype(np.float64)
vib = np.array(vib).astype(np.float64)
# vib = [(x-np.mean(vib))/np.std(vib) for x in vib]
# print(vib)
trainDataSet = [vib[i] for i in range(0, 100) if vib[i] < 5]
trainDataSet = np.reshape(trainDataSet, (len(trainDataSet), 1))
testDataSet = np.reshape(vib, (len(vib), 1))
'''
# Title = pd.read_csv(r'F1710001001.csv', header=None, nrows=1, usecols=[36,37,38], encoding='gbk')
'''
alarmedFlag = np.array([[i, flag[i]] for i, x in enumerate(flag) if x > 0.99])  # select points where the flag exceeds 0.99
plt.close('all')
plt.subplot(211)
plt.plot(testDataSet)
plt.ylabel('Vibration')
plt.xlabel('Sample index')
plt.subplot(212)
plt.plot(flag)
plt.ylabel('SPRT results')
plt.xlabel('Sample index')
plt.scatter(alarmedFlag[:, 0], alarmedFlag[:, 1], marker='x', c='r')
plt.show()
'''