|
@@ -0,0 +1,206 @@
|
|
|
+import multiprocessing
|
|
|
+import datetime
|
|
|
+import os
|
|
|
+import warnings
|
|
|
+
|
|
|
+import numpy as np
|
|
|
+import pandas as pd
|
|
|
+
|
|
|
+warnings.filterwarnings("ignore")
|
|
|
+
|
|
|
+
|
|
|
def __build_directory_dict(directory_dict, path, filter_types=None):
    """Recursively index the files under ``path`` into ``directory_dict``.

    Keys are directory paths, values are lists of contained file paths.
    When ``filter_types`` is a non-empty list, only files whose extension is
    listed are kept, and Office lock files (names containing '~$') are
    skipped.

    :param directory_dict: dict to populate in place
    :param path: directory to walk
    :param filter_types: optional list of allowed file extensions
    """
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            __build_directory_dict(directory_dict, full_path, filter_types=filter_types)
            continue
        if not os.path.isfile(full_path):
            continue
        # A directory gets its bucket as soon as it contains any file,
        # even one that the extension filter later rejects.
        bucket = directory_dict.setdefault(path, [])
        if not filter_types:
            bucket.append(full_path)
        elif str(full_path).split(".")[-1] in filter_types and "~$" not in str(full_path):
            bucket.append(full_path)
|
|
|
+
|
|
|
+
|
|
|
+# 读取路径下所有的excel文件
|
|
|
# Collect every spreadsheet file under a path.
def read_excel_files(read_path):
    """Return all xls/xlsx/csv/gz file paths found under ``read_path``.

    A path that is itself a file is returned as a one-element list.
    """
    if os.path.isfile(read_path):
        return [read_path]
    found = {}
    __build_directory_dict(found, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
    return [file for file_list in found.values() for file in file_list if file]
|
|
|
+
|
|
|
+
|
|
|
+# 创建路径
|
|
|
# Create a directory path.
def create_file_path(path, is_file_path=False):
    """Ensure a directory exists, creating intermediate directories as needed.

    :param path: directory to create, or a file path when ``is_file_path``
    :param is_file_path: when True, ``path`` names a file and only its parent
        directory is created
    """
    if is_file_path:
        path = os.path.dirname(path)
    # BUG FIX: os.makedirs('') raises FileNotFoundError — this happened for a
    # file name without a directory part, or for combine_df's default
    # save_path=''. An empty path now means "nothing to create".
    if path and not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
|
|
|
+
|
|
|
+
|
|
|
def boolean_is_check_data(df_cols, need_valid=True):
    """Return True when validation is skipped, or when any fault-related
    column is present.

    Column names are compared after stripping everything before the last
    '_' (the turbine-name prefix).

    :param df_cols: iterable of column names
    :param need_valid: when False, skip the check and report True
    """
    if not need_valid:
        return True
    fault_list = ['快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机', '风机自身故障停机', '限功率运行状态']
    suffixes = {str(col).split('_')[-1] for col in df_cols}
    return any(fault in suffixes for fault in fault_list)
|
|
|
+
|
|
|
+
|
|
|
def read_fle_to_df(file_path):
    """Read one turbine spreadsheet and normalise it.

    Columns arrive as '<wind_name>_<measure>'; the turbine prefix is stripped
    from the column names and stored in a 'wind_name' column, and '采样时间'
    is parsed and rounded up to the minute.

    :param file_path: Excel file to read
    :return: (check-data flag, wind_name, normalised DataFrame)
    """
    df = pd.read_excel(file_path)
    # Turbine name is the prefix of the first underscored column.
    prefixed = [col for col in df.columns if col.find('_') > -1]
    wind_name = prefixed[0].split('_')[0]
    df.columns = [col.split('_')[-1] for col in df.columns]
    df['wind_name'] = wind_name
    df['采样时间'] = pd.to_datetime(df['采样时间'])
    df['采样时间'] = df['采样时间'].dt.ceil('T')
    # Only files from a '批次' (batch) directory need the fault-column check.
    need_valid = file_path.find('批次') > -1
    return boolean_is_check_data(df.columns, need_valid), wind_name, df
|
|
|
+
|
|
|
+
|
|
|
def read_guzhangbaojing(file_path, start_time='2024-08-01 00:00:00', end_time='2024-10-01 00:00:00'):
    """Read a fault/alarm spreadsheet and keep rows inside [start_time, end_time).

    The date window used to be hard-coded; it is now a pair of keyword
    parameters whose defaults are the original bounds, so existing callers
    are unaffected.

    :param file_path: Excel file to read
    :param start_time: inclusive lower bound on '采样时间'
    :param end_time: exclusive upper bound on '采样时间'
    :return: filtered DataFrame with a 'wind_name' column
    :raises Exception: re-raises any read/parse error after printing the
        offending file path
    """
    try:
        df = pd.read_excel(file_path)
        df.rename(columns={'风机名': 'wind_name'}, inplace=True)
        df['采样时间'] = pd.to_datetime(df['采样时间'])
        # Align timestamps with the measurement data (ceil to the minute).
        df['采样时间'] = df['采样时间'].dt.ceil('T')
        df = df[(df['采样时间'] >= start_time) & (df['采样时间'] < end_time)]
        return df
    except Exception as e:
        print(file_path, e)
        raise e
|
|
|
+
|
|
|
+
|
|
|
def combine_df(dfs, wind_name, save_path=''):
    """Outer-merge one turbine's DataFrames on ('采样时间', 'wind_name').

    Measurement columns that appear in several files are kept by renaming the
    later occurrences to '<col>__1', '<col>__2', ... so no data is dropped by
    the merge; sae_to_csv later collapses them.

    :param dfs: non-empty list of DataFrames, each with '采样时间' and
        'wind_name' columns (NOTE: duplicated columns are renamed in place)
    :param wind_name: turbine identifier, echoed back in the return value
    :param save_path: output directory to pre-create; '' skips creation
    :return: (wind_name, merged DataFrame indexed by ['采样时间', 'wind_name'])
    :raises Exception: re-raises any merge error after printing the turbine
    """
    print(wind_name)
    seen_cols = list()
    dup_counts = dict()
    try:
        df = dfs[0]
        seen_cols.extend(df.columns)
        for now_df in dfs[1:]:
            for col in list(now_df.columns):
                if col in seen_cols and col not in ['采样时间', 'wind_name']:
                    # BUG FIX: the old read-then-increment bookkeeping gave the
                    # SECOND duplicate suffix __1 again (colliding with the
                    # first, which pandas then mangled to __1_x/__1_y).
                    # Increment first so suffixes run 1, 2, 3, ...
                    count = dup_counts.get(col, 0) + 1
                    dup_counts[col] = count
                    now_df.rename(columns={col: col + '__' + str(count)}, inplace=True)
            df = pd.merge(df, now_df, on=['采样时间', 'wind_name'], how='outer')
            seen_cols.extend(now_df.columns)
    except Exception as e:
        print(wind_name, e)
        raise e
    df.reset_index(inplace=True)
    df.drop_duplicates(inplace=True, subset=['采样时间', 'wind_name'])
    if 'index' in df.columns:
        del df['index']
    # BUG FIX: create_file_path('') used to crash on the default save_path.
    if save_path:
        create_file_path(save_path)
    df.sort_values(by='采样时间', inplace=True)
    df.set_index(keys=['采样时间', 'wind_name'], inplace=True)
    return wind_name, df
|
|
|
+
|
|
|
+
|
|
|
def sae_to_csv(wind_name, df, save_dir=None):
    """Collapse duplicated '<col>__N' columns back into '<col>' and write a CSV.

    Numeric duplicates are merged with a row-wise max over the suffixed
    columns; non-numeric duplicates take the first truthy value per row.

    :param wind_name: turbine name, used as the CSV file name
    :param df: merged DataFrame (typically combine_df output); mutated in place
    :param save_dir: output directory. BUG FIX: this used to be read from the
        module-level ``save_path`` global only; the global is still the
        default (backward compatible) but callers can now pass the directory.
    :raises Exception: re-raises any error after printing the columns
    """
    if save_dir is None:
        save_dir = save_path  # module-level global set in __main__
    try:
        dup_tuples = [(col.split('__')[0], col) for col in df.columns if col.find('__') > -1]
        col_dict = dict()
        for origin, col in dup_tuples:
            col_dict.setdefault(origin, set()).add(col)

        for origin, cols in col_dict.items():
            print(wind_name, origin, cols)
            if pd.api.types.is_numeric_dtype(df[origin]):
                df[origin] = df[list(cols)].max(axis=1)
            else:
                # BUG FIX: the old `[i for i in x.values if i][0]` raised
                # IndexError when every duplicate value in a row was falsy;
                # fall back to None in that case.
                df[origin] = df[list(cols)].apply(
                    lambda x: next((i for i in x.values if i), None), axis=1)
            for col in cols:
                if col != origin:
                    del df[col]

        df.to_csv(os.path.join(save_dir, wind_name + '.csv'), encoding='utf8')

    except Exception as e:
        print(wind_name, df.columns)
        raise e
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    begin = datetime.datetime.now()

    # Root of the raw SCADA data delivery (in-progress collection folder).
    base_path = r'/data/download/collection_data/1进行中/新华水电/收资数据/风机SCADA数据'

    dir1 = base_path + r'/data'  # per-turbine measurement spreadsheets
    dir2 = base_path + r'/故障报警/汇能机组数据-故障'  # fault records
    dir3 = base_path + r'/故障报警/报警'  # alarm records
    save_path = r'/data/download/collection_data/1进行中/新华水电/清理数据/合并批次1-2故障报警'

    create_file_path(save_path)

    # result_datas = [
    #     (r'/data/download/collection_data/1进行中/新华水电/风机SCADA数据',
    #      r'/data/download/collection_data/1进行中/新华水电/整理数据/批次1-2合并'),
    # ]

    # Step 1: read every measurement spreadsheet in parallel and group the
    # resulting frames by turbine name.
    data_wind_name = dict()
    files = read_excel_files(dir1)
    with multiprocessing.Pool(30) as pool:
        datas = pool.starmap(read_fle_to_df, [(file,) for file in files])
    for data in datas:
        # (check_data flag, turbine name, normalised frame)
        check_data, wind_name, df = data[0], data[1], data[2]
        if wind_name in data_wind_name.keys():
            data_wind_name[wind_name].append(df)
        else:
            data_wind_name[wind_name] = [df]

    # Step 2: merge each turbine's frames into one wide frame,
    # indexed by ('采样时间', 'wind_name').
    with multiprocessing.Pool(30) as pool:
        data_dfs = pool.starmap(combine_df,
                                [(dfs, wind_name, save_path) for wind_name, dfs
                                 in
                                 data_wind_name.items()])

    result_data_dict = dict()
    for wind_name, df in data_dfs:
        result_data_dict[wind_name] = df

    # Step 3: append fault and alarm records, aligned on the same
    # ('采样时间', 'wind_name') index, one source directory at a time.
    for dir4 in [dir2, dir3]:
        guzhang_files = read_excel_files(dir4)
        with multiprocessing.Pool(30) as pool:
            guzhang_datas = pool.starmap(read_guzhangbaojing, [(file,) for file in guzhang_files])
        guzhang_df = pd.DataFrame()
        for df in guzhang_datas:
            if not df.empty:
                guzhang_df = pd.concat([guzhang_df, df])
        wind_names = set(list(guzhang_df['wind_name'].values))
        for wind_name in wind_names:
            now_df = guzhang_df[guzhang_df['wind_name'] == wind_name]
            # Only turbines that have measurement data get their fault rows.
            if wind_name in result_data_dict.keys():
                now_df.reset_index(inplace=True)
                now_df.drop_duplicates(inplace=True, subset=['采样时间', 'wind_name'])
                if 'index' in now_df.columns:
                    del now_df['index']
                now_df.sort_values(by='采样时间', inplace=True)
                now_df.set_index(keys=['采样时间', 'wind_name'], inplace=True)
                res_df = result_data_dict[wind_name]
                # Column-wise concat: both frames share the same index keys.
                result_data_dict[wind_name] = pd.concat([res_df, now_df], axis=1)

    # Step 4: collapse duplicated '<col>__N' columns and write one CSV
    # per turbine into save_path.
    with multiprocessing.Pool(30) as pool:
        pool.starmap(sae_to_csv, [(wind_name, df) for wind_name, df in result_data_dict.items()])

    print(datetime.datetime.now() - begin)
|