# coding=utf-8
import datetime
import multiprocessing
import os
import sys

# Put the part of this file's absolute path that precedes "utils" on sys.path
# so that the ``utils`` package import below resolves when run as a script.
sys.path.insert(0, os.path.abspath(__file__).split("utils")[0])

import pandas as pd

from utils.file.trans_methods import read_file_to_df, read_excel_files


def get_time_space_count(start_time: datetime.datetime, end_time: datetime.datetime, time_space=1):
    """
    Count the sampling slots between two timestamps, both endpoints included.

    :param start_time: first timestamp of the range
    :param end_time: last timestamp of the range
    :param time_space: slot length in seconds
    :return: number of ``time_space``-second slots in the range, at least 1
    """
    delta = end_time - start_time
    # Only whole seconds are considered; any sub-second part is ignored.
    total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
    return abs(int(total_seconds / time_space)) + 1


def save_percent(value, save_decimal=7):
    """
    Convert a fraction to a percentage.

    The fraction is first rounded to ``save_decimal`` places. The product is
    then rounded again to the matching precision (``save_decimal - 2``) so
    that binary floating-point noise introduced by the multiplication
    (e.g. ``7.000000000000001`` for ``0.07``) does not leak into the output.

    :param value: fraction to convert (``0.5`` means 50 %)
    :param save_decimal: decimal places kept on the fraction
    :return: the percentage value
    """
    return round(round(value, save_decimal) * 100, save_decimal - 2)


def read_and_select(file_path):
    """
    Build a one-row data-quality summary for a single input file.

    The file must contain the columns ``名称``, ``时间``, ``风速`` and
    ``发电机有功功率``. The expected number of records is derived from the
    span between the earliest and latest ``时间`` at a 60-second interval.

    :param file_path: path of the file handed to ``read_file_to_df``
    :return: single-row DataFrame with the columns ``风机号``, ``重复率``,
        ``重复次数``, ``总记录数``, ``平均缺失率,单位%``, ``缺失数值`` and
        ``平均异常率``
    :raises Exception: any error is re-raised after printing ``file_path``
    """
    try:
        result_df = pd.DataFrame()
        df = read_file_to_df(file_path)
        read_cols_bak = df.columns.tolist()

        wind_name = df['名称'].values[0]
        df['时间'] = pd.to_datetime(df['时间'])

        # Expected record count at one record per 60 seconds.
        count = get_time_space_count(df['时间'].min(), df['时间'].max(), 60)
        repeat_time_count = df.shape[0] - len(df['时间'].unique())
        print(wind_name, count, repeat_time_count)

        result_df['风机号'] = [wind_name]
        result_df['重复率'] = [save_percent(repeat_time_count / count)]
        result_df['重复次数'] = [repeat_time_count]
        result_df['总记录数'] = [count]

        read_cols_bak.remove('名称')
        read_cols = []
        for read_col in read_cols_bak:
            if read_col == '时间':
                # Already converted to datetime above; always kept.
                read_cols.append(read_col)
                continue
            df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
            # Columns with no numeric value at all are left out of the statistics.
            if not df[read_col].isnull().all():
                read_cols.append(read_col)

        # Non-null count per column; only the first ``名称`` group is used.
        count_df = df.groupby(by=['名称']).count().reset_index()
        non_null_counts = count_df[read_cols].values[0]
        total_count = non_null_counts.sum()
        expected_total = count * len(read_cols)
        print(wind_name, total_count, expected_total)

        result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / expected_total)]
        result_df['缺失数值'] = [
            '-'.join([f'{col_name}_{count - present}'
                      for col_name, present in zip(read_cols, non_null_counts)])]

        # Rows whose value falls outside the accepted range of each signal.
        error_fengsu_count = df.query("(风速 < 0) | (风速 > 80)").shape[0]
        error_yougong_gonglv = df.query("(发电机有功功率 < -200) | (发电机有功功率 > 2500)").shape[0]
        result_df['平均异常率'] = [save_percent((error_fengsu_count + error_yougong_gonglv) / (2 * count))]
    except Exception:
        # Report which file failed, then let the original error propagate.
        print(file_path)
        raise

    return result_df


if __name__ == '__main__':
    read_dir = r'D:\data\tmp_data\1分\远景1min'
    files = read_excel_files(read_dir)

    with multiprocessing.Pool(4) as pool:
        dfs = pool.map(read_and_select, files)

    if not dfs:
        # pd.concat raises an opaque ValueError on an empty list.
        raise SystemExit(f'No input files found in {read_dir}')

    df = pd.concat(dfs, ignore_index=True)
    df.sort_values(by=['风机号'], inplace=True)
    df.to_csv("神木风电场-1分钟.csv", encoding='utf8', index=False)