神木_完整度_1分.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. # coding=utf-8
  2. import datetime
  3. import multiprocessing
  4. import os
  5. import sys
  6. sys.path.insert(0, os.path.abspath(__file__).split("utils")[0])
  7. import pandas as pd
  8. from utils.file.trans_methods import read_file_to_df, read_excel_files
  9. def get_time_space_count(start_time: datetime.datetime, end_time: datetime.datetime, time_space=1):
  10. """
  11. 获取俩个时间之间的个数
  12. :return: 查询时间间隔
  13. """
  14. delta = end_time - start_time
  15. total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
  16. return abs(int(total_seconds / time_space)) + 1
  17. def save_percent(value, save_decimal=7):
  18. return round(value, save_decimal) * 100
  19. def read_and_select(file_path):
  20. try:
  21. result_df = pd.DataFrame()
  22. df = read_file_to_df(file_path)
  23. read_cols_bak = df.columns.tolist()
  24. wind_name = df['名称'].values[0]
  25. df['时间'] = pd.to_datetime(df['时间'])
  26. count = get_time_space_count(df['时间'].min(), df['时间'].max(), 60)
  27. repeat_time_count = df.shape[0] - len(df['时间'].unique())
  28. print(wind_name, count, repeat_time_count)
  29. result_df['风机号'] = [wind_name]
  30. result_df['重复率'] = [save_percent(repeat_time_count / count)]
  31. result_df['重复次数'] = [repeat_time_count]
  32. result_df['总记录数'] = [count]
  33. read_cols_bak.remove('名称')
  34. read_cols = list()
  35. for read_col in read_cols_bak:
  36. if read_col == '时间':
  37. df[read_col] = pd.to_datetime(df[read_col], errors='coerce')
  38. read_cols.append(read_col)
  39. else:
  40. df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
  41. if not df[read_col].isnull().all():
  42. read_cols.append(read_col)
  43. group_df = df.groupby(by=['名称']).count()
  44. group_df.reset_index(inplace=True)
  45. count_df = pd.DataFrame(group_df)
  46. total_count = count_df[read_cols].values[0].sum()
  47. print(wind_name, total_count, count * len(read_cols))
  48. result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / (count * len(read_cols)))]
  49. result_df['缺失数值'] = [
  50. '-'.join([f'{col_name}_{str(count - i)}' for col_name, i in zip(read_cols, count_df[read_cols].values[0])])]
  51. del group_df
  52. error_fengsu_count = df.query("(风速 < 0) | (风速 > 80)").shape[0]
  53. error_yougong_gonglv = df.query("(发电机有功功率 < -200) | (发电机有功功率 > 2500)").shape[0]
  54. result_df['平均异常率'] = [save_percent((error_fengsu_count + error_yougong_gonglv) / (2 * count))]
  55. except Exception as e:
  56. print(file_path)
  57. raise e
  58. return result_df
  59. if __name__ == '__main__':
  60. read_dir = r'D:\data\tmp_data\1分\远景1min'
  61. files = read_excel_files(read_dir)
  62. with multiprocessing.Pool(4) as pool:
  63. dfs = pool.map(read_and_select, files)
  64. df = pd.concat(dfs, ignore_index=True)
  65. df.sort_values(by=['风机号'], inplace=True)
  66. df.to_csv("神木风电场-1分钟.csv", encoding='utf8', index=False)