# @Time : 2019-6-19
# @Author : wilbur.cheng@SKF
# @FileName: baseMSET.py
# @Software: a package in the SKF anomaly detection library implementing the multivariate state estimation technique
# This module contains the functions used in MSET: similarity calculation, the state memory matrix D,
# the normal state matrix L, residual calculation, and a basic SPRT (Sequential Probability Ratio Test)
# function for binary hypothesis testing.
#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import BallTree
import openpyxl
import time

class MSET():

    matrixD = None
    matrixL = None
    healthyResidual = None
    normalDataBallTree = None

    def __init__(self):
        self.model = None
    def calcSimilarity(self, x, y, m='euc'):
        """
        Calculate the similarity of two feature vectors.
        :param x: one input feature list
        :param y: the other input feature list
        :param m: similarity method; the default is the Euclidean-based kernel 'euc',
                  and a city-block (Manhattan) kernel is used when m is set to 'cbd'
        :return: similarity of the two feature vectors, float, range (0, 1]
        """
        if (len(x) != len(y)):
            return 0
        if (m == 'cbd'):
            dSimilarity = [1/(1+np.abs(p-q)) for p, q in zip(x, y)]
            dSimilarity = np.sum(dSimilarity)/len(x)
        else:
            dSimilarity = [np.power(p-q, 2) for p, q in zip(x, y)]
            dSimilarity = 1/(1+np.sqrt(np.sum(dSimilarity)))
        return dSimilarity
    def genDLMatrix(self, trainDataset, dataSize4D=100, dataSize4L=50):
        """
        Automatically generate the D and L matrices from the training data set,
        assuming the training data are all normal-state data.
        :param trainDataset: 2D array, [count of data, length of feature]
        :param dataSize4D: minimum number of states for matrix D
        :param dataSize4L: minimum number of states for matrix L
        :return: 0 if successful, -1 on failure
        """
        [m, n] = np.shape(trainDataset)
        if m < dataSize4D + dataSize4L:
            print('training dataset is too small to generate the matrices D and L')
            return -1

        self.matrixD = []
        selectIndex4D = []
        # Step 1: add every state holding a minimum or maximum value of some feature into matrix D
        for i in range(0, n):
            feature_i = trainDataset[:, i]
            minIndex = np.argmin(feature_i)
            maxIndex = np.argmax(feature_i)
            self.matrixD.append(trainDataset[minIndex, :].tolist())
            selectIndex4D.append(minIndex)
            self.matrixD.append(trainDataset[maxIndex, :].tolist())
            selectIndex4D.append(maxIndex)
        # Step 2: iteratively add the state with the maximum average distance to the states already in matrix D
        while(1):
            if (len(selectIndex4D) >= dataSize4D):
                break
            # Get the list of still-unselected (free) states
            freeStateList = list(set(np.arange(0, len(trainDataset))) - set(selectIndex4D))
            # Calculate the average distance of each free state to the states already in matrix D
            distList = []
            for i in freeStateList:
                tmpState = trainDataset[i]
                tmpDist = [1-self.calcSimilarity(x, tmpState) for x in self.matrixD]
                distList.append(np.mean(tmpDist))
            # Select the free state with the largest average distance and add it to matrix D
            selectId = freeStateList[distList.index(np.max(distList))]
            self.matrixD.append(trainDataset[selectId, :].tolist())
            selectIndex4D.append(selectId)
        # Convert matrixD from list to array
        self.matrixD = np.array(self.matrixD)
        self.normalDataBallTree = BallTree(self.matrixD, leaf_size=4,
                                           metric=lambda i, j: 1-self.calcSimilarity(i, j))

        # Step 3: select states for matrix L
        #index4L = list(set(np.arange(0, len(trainDataset))) - set(selectIndex4D))
        #self.matrixL = trainDataset[index4L, :]
        # Given the limited dataset, use all the training data as matrix L
        self.matrixL = trainDataset
        # Calculate the healthy residual from matrix L
        lamdaRatio = 1e-3
        [m, n] = np.shape(self.matrixD)
        # The D-to-D similarity matrix is m x m, so the ridge term must be eye(m), not eye(n)
        self.DDSimilarity = np.array([[1-self.calcSimilarity(x, y) for x in self.matrixD]
                                      for y in self.matrixD]) + lamdaRatio*np.eye(m)
        self.DDSimilarity = 1/self.DDSimilarity

        #self.healthyResidual = self.calcResidual(self.matrixL)
        self.healthyResidual = self.calcResidualByLocallyWeightedLR(self.matrixL)
        return 0
    def calcResidualByLocallyWeightedLR(self, newStates):
        """
        Find the K nearest neighbors of each input state in matrix D, then
        calculate the estimated state as their locally weighted average.
        :param newStates: input state list
        :return: residual R_x = estimated states - input states
        """
        [m, n] = np.shape(newStates)
        est_X = []
        # print(newStates)
        for x in newStates:
            (dist, iList) = self.normalDataBallTree.query([x], 20, return_distance=True)
            weight = 1/(dist[0]+1e-1)
            weight = weight/sum(weight)
            # Sum along the neighbor axis only, so multivariate states keep their feature dimension
            eState = np.sum([w*self.matrixD[i] for w, i in zip(weight, iList[0])], axis=0)
            est_X.append(eState)
        est_X = np.reshape(est_X, (m, n))
        # print(est_X)
        # print(newStates)
        return est_X - newStates
    def calcStateResidual(self, newsStates):
        stateResidual = self.calcResidualByLocallyWeightedLR(newsStates)
        return stateResidual
    def calcSPRT(self, newsStates, feature_weight, alpha=0.1, beta=0.1, decisionGroup=5):
        """
        Anomaly detection based on Wald's SPRT algorithm; see A. Wald, Sequential
        Analysis, Wiley, New York, NY, USA, 1947.
        :param newsStates: input state list
        :param feature_weight: the importance weight of each feature, normalized and nonnegative
        :param alpha: prescribed false alarm rate, 0 < alpha < 1
        :param beta: prescribed missed alarm rate, 0 < beta < 1
        :param decisionGroup: number of samples in each decision group
        :return: a score in [0, 100] for each sliding group of states: 100 when the
                 log-likelihood ratio is near zero, falling toward 0 as the ratio
                 approaches either decision threshold
        """
        # Step 1: reduce the raw residual vectors to scalars using the feature weights
        #stateResidual = self.calcResidual(newsStates)
        stateResidual = self.calcResidualByLocallyWeightedLR(newsStates)
        # print(stateResidual)
        weightedStateResidual = [np.dot(x, feature_weight) for x in stateResidual]
        # print(weightedStateResidual)
        weightedHealthyResidual = [np.dot(x, feature_weight) for x in self.healthyResidual]
        '''
        plt.subplot(211)
        plt.plot(weightedHealthyResidual)
        plt.xlabel('Sample index')
        plt.ylabel('Healthy State Residual')
        plt.subplot(212)
        plt.plot(weightedStateResidual)
        plt.xlabel('Sample index')
        plt.ylabel('All State Residual')
        plt.show()
        '''
        # Step 2: estimate the healthy-state residual distribution
        mu0 = np.mean(weightedHealthyResidual)
        sigma0 = np.std(weightedHealthyResidual)
        #sigma1 = np.std(weightedStateResidual)
        lowThres = np.log(beta/(1-alpha))
        highThres = np.log((1-beta)/alpha)

        flag = []
        for i in range(0, len(newsStates)-decisionGroup+1):
            # For each group, estimate the new residual distribution,
            # then evaluate the hypothesis test
            mu1 = np.mean(weightedStateResidual[i:i+decisionGroup])
            si = np.sum(weightedStateResidual[i:i+decisionGroup])*(mu1-mu0)/sigma0**2 - decisionGroup*(mu1**2-mu0**2)/(2*sigma0**2)
            # Clip the log-likelihood ratio to the decision thresholds,
            # then rescale it to a [0, 100] score
            if (si > highThres):
                si = highThres
            if (si < lowThres):
                si = lowThres
            if (si > 0):
                si = si/highThres
            if (si < 0):
                si = si/lowThres
            si = 100-si*100
            flag.append(si)
        return flag
    def CRITIC_prepare(self, data, flag=1):  # data preprocessing before the weight calculation
        '''
        :param data: input data, a DataFrame
        :param flag: flag=0 applies positive-indicator normalization, flag=1 applies negative-indicator normalization
        :return: a normalized DataFrame
        '''
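        # Worked example (hypothetical single column with min 10 and max 30): a raw
        # value of 25 becomes (25-10)/(30-10) = 0.75 under flag=0, and
        # (30-25)/(30-10) = 0.25 under flag=1.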
        data_columns = data.columns.values
        maxnum = np.max(data, axis=0)
        minnum = np.min(data, axis=0)
        if flag == 0:  # normalization for positive indicators (larger is better)
            Y = (data - minnum) * 1.0 / (maxnum - minnum)
        if flag == 1:  # normalization for negative indicators (smaller is better)
            Y = (maxnum - data) / (maxnum - minnum)
        # Replace exact zeros so that a later ln(0) cannot occur
        Y0 = np.array(Y * 1.0)
        Y0[np.where(Y0 == 0)] = 0.00001
        Y0 = pd.DataFrame(Y0, columns=data_columns)
        return Y0
    def CRITIC(self, data):  # CRITIC objective weighting of the measurement points within a subsystem
        '''
        :param data: normalized, preprocessed DataFrame
        :return: a Series of weights for the measurement points (the indicator scoring and ranking is left commented out below)
        '''
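        # CRITIC weight sketch: with s_j the standard deviation of feature j and
        # r the correlation matrix, the information content is
        #   c_j = s_j * sum_k (1 - r_jk)
        # and the weights are w_j = c_j / sum(c).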
        n, m = data.shape
        s = np.std(data, axis=0)
        r = np.corrcoef(data, rowvar=False)
        a = np.sum(1 - r, axis=1)
        c = s * a
        w = c / np.sum(c)
        # score = np.round(np.sum(data * w, axis=1), 6)
        # data['score'] = score
        # data.sort_values(by=['score'], ascending=False, inplace=True)
        return w
    def calcHealthy(self, df_data):  # calculate the health score of a subsystem
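        # Pipeline sketch: normalize the data and derive per-feature weights with
        # CRITIC, run MSET + SPRT per feature (first half of the samples as
        # training data, second half as test data), then combine the per-feature
        # SPRT scores into one health score using the CRITIC weights.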
        cols = df_data.columns
        df_data[cols] = df_data[cols].apply(pd.to_numeric, errors='coerce')
        df_data = df_data.dropna()  # drop empty rows and non-numeric entries
        W_prepare_data = self.CRITIC_prepare(df_data)
        Weights_data = self.CRITIC(W_prepare_data)
        df_data_values = df_data.values
        df_data_values = np.array(df_data_values)
        [m, n] = np.shape(df_data_values)
        # Residual = []
        flag_Spart_data = []
        for i in range(0, n):
            df_data_i = df_data_values[:, i]
            trainDataSet_data = df_data_i[0:len(df_data_i) // 2]
            testDataSet_data = df_data_i[len(df_data_i) // 2 + 1:]
            trainDataSet_data = np.reshape(trainDataSet_data, (len(trainDataSet_data), 1))
            testDataSet_data = np.reshape(testDataSet_data, (len(testDataSet_data), 1))
            self.genDLMatrix(trainDataSet_data, 60, 5)
            # stateResidual = self.calcStateResidual(testDataSet_data)
            flag_data = self.calcSPRT(testDataSet_data, np.array([1.0]), decisionGroup=1)  # single feature, unit weight
            # Residual.append(stateResidual)
            flag_Spart_data.append(flag_data)
        # weights = np.array([1/3, 1/3, 1/3])
        W_data = np.array(Weights_data)
        W_data = W_data.reshape(-1, 1)
        flag_Spart_data = np.array(flag_Spart_data)
        flag_Spart_data = flag_Spart_data.T
        Score_data = np.dot(flag_Spart_data, W_data)
        Health_data = np.mean(Score_data)
        return Weights_data, Health_data
    def ahp(self, matrix):  # AHP (analytic hierarchy process, subjective) weights for the turbine subsystems
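        # Principal-eigenvector method, in brief: for a perfectly consistent
        # reciprocal judgment matrix A with weight vector w, A @ w = n * w, so the
        # eigenvector of the largest eigenvalue (normalized to sum to 1) serves as
        # the weight estimate when A is only approximately consistent.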
        # Compute the eigenvalues and eigenvectors
        eigenvalue, eigenvector = np.linalg.eig(matrix)
        # Take the eigenvector of the largest eigenvalue and normalize it
        max_eigenvalue_index = np.argmax(eigenvalue)
        max_eigenvalue = eigenvalue[max_eigenvalue_index]
        weight = eigenvector[:, max_eigenvalue_index]
        weight = weight / np.sum(weight)
        weight = weight.real
        return weight, max_eigenvalue
    def consistency_check(self, matrix, weight):  # consistency check for the AHP judgment matrix
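        # Consistency index CI = (lambda_max - n) / (n - 1); the consistency ratio
        # CR = CI / RI compares CI against the random index RI for matrices of the
        # same order, and CR < 0.1 is the usual acceptance threshold.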
        n = len(matrix)
        # Compute the consistency index CI
        lambda_max = np.sum(np.dot(matrix, weight) / weight) / n
        CI = (lambda_max - n) / (n - 1)
        # Random consistency index RI; this table covers matrices up to order 19
        RI_list = [0, 0, 0.58, 0.90, 1.12, 1.24, 1.32, 1.41, 1.45, 1.49, 1.52, 1.54,
                   1.56, 1.58, 1.59, 1.61, 1.63, 1.64, 1.65]
        RI = RI_list[n - 1]
        # Compute the consistency ratio CR
        CR = CI / RI
        return CI, CR
if __name__ == '__main__':
    start_time = time.time()
    myMSET = MSET()
    # 1. Calculate the health score of each subsystem (subsystems: generator set, drivetrain
    #    (direct-drive units have no gearbox, no data), nacelle system, converter system,
    #    grid environment, auxiliary system (no data))
    # 1.1 Generator-set health score
    df_Gen = pd.read_excel(r'F26_SEP_ZHANGYAOXIAN.xlsx',
                           usecols=[10, 11, 12, 17])  # generator indicators: U-phase temperature 10, V-phase temperature 11, W-phase temperature 12, bearing temperature 17
    W_Gen, Health_Gen = myMSET.calcHealthy(df_Gen)
    print(W_Gen)
    print('Generator-set health score:', Health_Gen)
    # 1.2 Nacelle-system health score
    df_Nac = pd.read_excel(r'F26_SEP_ZHANGYAOXIAN.xlsx',
                           usecols=[23, 24, 41, 42])  # nacelle indicators: tower fore-aft vibration 23, tower side-side vibration 24, nacelle position 41, nacelle temperature 42
    W_Nac, Health_Nac = myMSET.calcHealthy(df_Nac)
    print(W_Nac)
    print('Nacelle-system health score:', Health_Nac)
    # 1.3 Converter-system health score
    df_Con = pd.read_excel(r'F26_SEP_ZHANGYAOXIAN.xlsx',
                           usecols=[18, 19])  # converter indicators: converter active power 19, converter coolant temperature 18
    W_Con, Health_Con = myMSET.calcHealthy(df_Con)
    print(W_Con)
    print('Converter-system health score:', Health_Con)
    # 1.4 Grid-environment health score
    df_Grid = pd.read_excel(r'F26_SEP_ZHANGYAOXIAN.xlsx',
                            usecols=[32, 38, 64, 65, 66])  # grid indicators: active power 38, reactive power 32, grid phase-A current 64, phase-B current 65, phase-C current 66
    W_Grid, Health_Grid = myMSET.calcHealthy(df_Grid)
    print(W_Grid)
    print('Grid-environment health score:', Health_Grid)
    # 2. Calculate the weight of each subsystem
    # Pairwise judgment matrix (generator set, nacelle system, converter system, grid environment)
    matrix_subsys = np.array([
        [1, 2, 3, 4],
        [1/2, 1, 2, 3],
        [1/3, 1/2, 1, 2],
        [1/4, 1/3, 1/2, 1],
    ])
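    # Each entry follows the AHP pairwise-comparison convention: matrix_subsys[i][j]
    # states how much more important subsystem i is than subsystem j, and the
    # matrix is reciprocal (matrix_subsys[j][i] = 1/matrix_subsys[i][j]).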
    # Compute the weights and the maximum eigenvalue of the judgment matrix
    weight_subsys, max_eigenvalue_subsys = myMSET.ahp(matrix_subsys)
    print(weight_subsys)
    # Check consistency
    CI1, CR1 = myMSET.consistency_check(matrix_subsys, weight_subsys)
    # 3. Calculate the health score of the whole turbine
    Score_subsys = np.array([Health_Gen, Health_Nac, Health_Con, Health_Grid])
    weight_subsys = weight_subsys.reshape(-1, 1)
    Score_WT = np.dot(Score_subsys, weight_subsys)
    print('Whole-turbine health score:', Score_WT)
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time} seconds")
    #
    '''
    f = open("speed_vib.txt", "r")
    data1 = f.read()
    f.close()
    data1 = data1.split('\n')
    rpm = [(row.split('\t')[0]).strip() for row in data1]
    vib = [(row.split('\t')[-1]).strip() for row in data1]
    # print(rpm)
    rpm = np.array(rpm).astype(np.float64)
    vib = np.array(vib).astype(np.float64)
    #vib = [(x-np.mean(vib))/np.std(vib) for x in vib]
    #print(vib)
    trainDataSet = [vib[i] for i in range(0,100) if vib[i] < 5]
    trainDataSet = np.reshape(trainDataSet, (len(trainDataSet), 1))
    testDataSet = np.reshape(vib, (len(vib), 1))
    '''
    # Title = pd.read_csv(r'F1710001001.csv', header=None, nrows=1, usecols=[36,37,38], encoding='gbk')
    '''
    alarmedFlag = np.array([[i, flag[i]] for i, x in enumerate(flag) if x > 0.99])  # select the points in flag greater than 0.99
    plt.close('all')
    plt.subplot(211)
    plt.plot(testDataSet)
    plt.ylabel('Vibration')
    plt.xlabel('Sample index')
    plt.subplot(212)
    plt.plot(flag)
    plt.ylabel('SPRT results')
    plt.xlabel('Sample index')
    plt.scatter(alarmedFlag[:,0], alarmedFlag[:,1], marker='x', c='r')
    plt.show()
    '''