# dataIntegrityOfSecondAnalyst.py (8.0 KB)
import calendar
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from geopy.distance import geodesic
from plotly.subplots import make_subplots

from algorithmContract.confBusiness import *
from behavior.analyst import Analyst
from utils.directoryUtil import DirectoryUtil as dir
  15. class DataIntegrityOfSecondAnalyst(Analyst):
  16. """
  17. 风电机组秒级数据完整度分析
  18. """
  19. def typeAnalyst(self):
  20. return "data_integrity_second"
  21. def filterCommon(self,dataFrame:pd.DataFrame, confData:ConfBusiness):
  22. return dataFrame
  23. def turbinesAnalysis(self, dataFrameMerge, outputAnalysisDir, confData: ConfBusiness):
  24. groupedDataFrame = self.dataIntegrityByMonth(
  25. dataFrameMerge,confData, Field_NameOfTurbine )
  26. print("groupedDataFrame : \n {}".format(groupedDataFrame.head()))
  27. self.plotByAllMonth(groupedDataFrame, outputAnalysisDir,
  28. confData.farm_name, Field_NameOfTurbine)
  29. def generate_weighted_random(self):
  30. # 首先,尝试生成一个在91至100之间的随机数,这样大部分值都会落在这个区间
  31. if random.random() < 0.8: # 假设80%的随机数应该在91至100之间
  32. return random.randint(91, 100)
  33. else: # 剩下的20%则均匀分布在79至100之间(包括79但不包括100)
  34. return random.randint(79, 100)
  35. def fullMonthIndex(self,start_time,end_time,turbine_name,new_frame):
  36. months = (end_time.year - start_time.year)*12 + end_time.month - start_time.month
  37. month_range = ['%04d-%02d' % (int(start_time.year + mon//12), int(mon%12+1)) for mon in range(start_time.month-1, start_time.month+months)]
  38. month_index = pd.DataFrame(month_range,columns=[Field_YearMonth])
  39. plot_res = pd.DataFrame()
  40. grouped = new_frame.groupby(turbine_name)
  41. for name,group in grouped:
  42. group = pd.merge(group,month_index,on=Field_YearMonth,how='outer')
  43. group['数据完整度%'] = group['数据完整度%'].fillna(0)
  44. group[turbine_name] = name
  45. group['year'] = group[Field_YearMonth].apply(lambda x:str(x).split('-')[0])
  46. group['month'] = group[Field_YearMonth].apply(lambda x:str(x).split('-')[1])
  47. plot_res = pd.concat([plot_res,group],axis=0,sort=False)
  48. return plot_res
  49. def dataIntegrityByMonth(self, dataFrameMerge:pd.DataFrame, confData:ConfBusiness,fieldTurbineName):
  50. grouped = dataFrameMerge.groupby([dataFrameMerge.loc[:, confData.field_turbine_time].dt.year.rename('year'),
  51. dataFrameMerge.loc[:, confData.field_turbine_time].dt.month.rename(
  52. 'month'),
  53. dataFrameMerge.loc[:, fieldTurbineName]]).agg({'count'})[confData.field_turbine_time].rename({'count': '长度'}, axis=1)
  54. new_frame = grouped.reset_index('month')
  55. new_frame = new_frame.assign(数据完整度=(100 * new_frame['长度'] / (
  56. new_frame['month'].map(lambda x: calendar.mdays[x] * 24 * 3600 / confData.time_period))).round(decimals=0))
  57. # new_frame['数据完整度'] = [self.generate_weighted_random() for _ in range(len(new_frame))]
  58. new_frame = new_frame.rename(columns={'数据完整度': '数据完整度%'})
  59. new_frame = new_frame.reset_index()
  60. new_frame['month'] = new_frame['month'].astype(
  61. str).apply(lambda x: x.zfill(2))
  62. new_frame[Field_YearMonth] = new_frame['year'].astype(
  63. str) + '-' + new_frame['month'].astype(str)
  64. new_frame = self.fullMonthIndex(confData.start_time,confData.end_time,fieldTurbineName,new_frame)
  65. return new_frame
  66. def plotByAllMonth(self, groupedDataFrame, outputAnalysisDir, farmName, fieldTurbineName):
  67. title = 'time integrity check(%)'
  68. fig, ax = plt.subplots(figsize=(18, 15), dpi=300)
  69. # 风机数量小于月份
  70. if len(set(groupedDataFrame.loc[:, Field_YearMonth])) > len(set(groupedDataFrame.loc[:, fieldTurbineName])):
  71. result = pd.pivot(groupedDataFrame, index=fieldTurbineName,
  72. columns=Field_YearMonth, values="数据完整度%")
  73. ax = sns.heatmap(data=result, square=True, annot=True,
  74. linewidths=0.3, cbar=False, fmt='g',)
  75. bottom, top = ax.get_ylim()
  76. ax.set_ylim(bottom + 0.5, top - 0.5)
  77. ax.set_title(title)
  78. plt.setp(ax.get_yticklabels(), rotation=0)
  79. plt.setp(ax.get_xticklabels(), rotation=90)
  80. plt.savefig(outputAnalysisDir +
  81. r'/{}数据完整度分析.png'.format(farmName), bbox_inches='tight')
  82. plt.close()
  83. else:
  84. result = pd.pivot(groupedDataFrame, index=Field_YearMonth,
  85. columns=fieldTurbineName, values="数据完整度%")
  86. ax = sns.heatmap(data=result, square=True, annot=True,
  87. linewidths=0.3, cbar=False, fmt='g',)
  88. bottom, top = ax.get_ylim()
  89. ax.set_ylim(bottom + 0.5, top - 0.5)
  90. ax.set_title(title)
  91. plt.setp(ax.get_yticklabels(), rotation=0)
  92. plt.setp(ax.get_xticklabels(), rotation=90)
  93. # 设置x轴标签斜向展示
  94. plt.xticks(rotation=45) # 旋转45度
  95. plt.savefig(outputAnalysisDir +
  96. r'/{}数据完整度分析.png'.format(farmName), bbox_inches='tight')
  97. plt.close()
  98. def draw(self, groupedDataFrame, outputAnalysisDir, farmName, fieldTurbineName):
  99. fig = make_subplots(rows=1, cols=1)
  100. if len(set(groupedDataFrame[Field_YearMonth])) > len(set(groupedDataFrame[fieldTurbineName])):
  101. result = groupedDataFrame.pivot(
  102. index=fieldTurbineName, columns=Field_YearMonth, values="数据完整度%")
  103. fig.add_trace(
  104. go.Heatmap(
  105. z=result.values,
  106. x=result.columns,
  107. y=result.index,
  108. colorscale='Viridis',
  109. colorbar=dict(title='数据完整度%'),
  110. text=[[f"{value}" for value in row]
  111. for row in result.values], # 显示的文本(百分比)
  112. texttemplate="%{text}", # 使用文本模板
  113. yaxis=dict(
  114. tickformat="%Y-%m", # 设置y轴刻度的格式
  115. tickmode="array", # 如果需要,可以指定自定义的刻度标签
  116. tickvals=result.columns.strftime("%Y-%m").tolist() # 如果需要自定义刻度位置
  117. )
  118. )
  119. )
  120. else:
  121. result = groupedDataFrame.pivot(
  122. index=Field_YearMonth, columns=fieldTurbineName, values="数据完整度%")
  123. fig.add_trace(
  124. go.Heatmap(
  125. z=result.values,
  126. x=result.columns,
  127. y=result.index,
  128. colorscale='Viridis',
  129. colorbar=dict(title='数据完整度%'),
  130. text=[[f"{value}" for value in row]
  131. for row in result.values], # 显示的文本(百分比)
  132. texttemplate="%{text}", # 使用文本模板
  133. xaxis=dict(
  134. tickformat="%Y-%m", # 设置y轴刻度的格式
  135. tickmode="array", # 如果需要,可以指定自定义的刻度标签
  136. tickvals=result.index.strftime("%Y-%m").tolist() # 如果需要自定义刻度位置
  137. )
  138. )
  139. )
  140. fig.update_layout(
  141. title_text='{}-time integrity check(%)'.format(farmName),
  142. xaxis_nticks=36
  143. )
  144. fig.write_image(outputAnalysisDir + '/' +
  145. '{}数据完整度分析.png'.format(farmName))