张崾先统计缺失率-分.py 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. import multiprocessing
  2. import os
  3. import sys
  4. sys.path.insert(0, os.path.abspath(__file__).split("tmp_file")[0])
  5. import pandas as pd
  6. from utils.file.trans_methods import read_file_to_df
  7. def save_percent(value, save_decimal=7):
  8. return round(value, save_decimal) * 100
  9. def read_and_select(file_path, read_cols):
  10. result_df = pd.DataFrame()
  11. df = read_file_to_df(file_path, read_cols=read_cols)
  12. wind_name = os.path.basename(file_path).split('.')[0]
  13. df['风机号'] = wind_name
  14. df = df.query("(startTime>='2023-10-01 00:00:00') & (startTime<'2024-10-01 00:00:00')")
  15. count = 366 * 24 * 6 # 十分钟数据 2024年366天
  16. repeat_time_count = df.shape[0] - len(df['startTime'].unique())
  17. print(wind_name, count, repeat_time_count)
  18. result_df['风机号'] = [wind_name]
  19. result_df['重复率'] = [save_percent(repeat_time_count / count)]
  20. result_df['重复次数'] = [repeat_time_count]
  21. result_df['总记录数'] = [count]
  22. for read_col in read_cols:
  23. if read_col != 'startTime':
  24. df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
  25. else:
  26. df[read_col] = pd.to_datetime(df[read_col], errors='coerce')
  27. group_df = df.groupby(by=['风机号']).count()
  28. group_df.reset_index(inplace=True)
  29. count_df = pd.DataFrame(group_df)
  30. total_count = count_df[read_cols].values[0].sum()
  31. print(wind_name, total_count, count * len(read_cols))
  32. result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / (count * len(read_cols)))]
  33. result_df['缺失数值'] = ['-'.join([str(count - i) for i in count_df[read_cols].values[0]])]
  34. del group_df
  35. error_fengsu_count = df.query("(风速10min < 0) | (风速10min > 80)").shape[0]
  36. error_yougong_gonglv = df.query("(有功功率 < -200) | (有功功率 > 4800)").shape[0]
  37. result_df['平均异常率'] = [save_percent((error_fengsu_count + error_yougong_gonglv) / (2 * count))]
  38. return result_df
  39. if __name__ == '__main__':
  40. read_cols_str = 'startTime,有功功率,叶轮转速,发电机转速,风速10min,桨叶1角度,桨叶2角度,桨叶3角度,机舱位置,偏航误差,发电机轴承温度,机舱内温度,环境温度,发电机U相温度,发电机V相温度,发电机W相温度'
  41. read_cols = [i for i in read_cols_str.split(",") if i]
  42. read_dir = r'/data/download/collection_data/1进行中/张崾先风电场-陕西-华电/收资数据/导出数据2'
  43. files = os.listdir(read_dir)
  44. with multiprocessing.Pool(16) as pool:
  45. dfs = pool.starmap(read_and_select, [(os.path.join(read_dir, i), read_cols) for i in files])
  46. df = pd.concat(dfs, ignore_index=True)
  47. df.sort_values(by=['风机号'], inplace=True)
  48. df.to_csv("张崾先统计-分钟.csv", encoding='utf8', index=False)