
Handle fault/alarm records that have no end time,
add detailed logging to the labeling step,
auto-detect whether active power needs converting to kW

wzl 7 months ago
parent commit 05cdb42415

+ 9 - 6
etl/wind_power/fault_warn/FaultWarnTrans.py

@@ -73,11 +73,13 @@ class FaultWarnTrans(BaseDataTrans):
                 if valid_eval(time_format):
                     eval_str = f"df['begin_time'].apply(lambda error_time: {time_format} )"
                     df['begin_time'] = eval(eval_str)
-                    eval_str = f"df['end_time'].apply(lambda error_time: {time_format} )"
-                    df['end_time'] = eval(eval_str)
+                    if 'end_time' in df.columns:
+                        eval_str = f"df['end_time'].apply(lambda error_time: {time_format} )"
+                        df['end_time'] = eval(eval_str)
 
             df['begin_time'] = pd.to_datetime(df['begin_time'], errors='coerce')
-            df['end_time'] = pd.to_datetime(df['end_time'], errors='coerce')
+            if 'end_time' in df.columns:
+                df['end_time'] = pd.to_datetime(df['end_time'], errors='coerce')
 
             exec_wind_turbine_number = read_conf(conf_map, 'exec_wind_turbine_number')
             if exec_wind_turbine_number:
@@ -106,11 +108,12 @@ class FaultWarnTrans(BaseDataTrans):
             df['wind_turbine_number'] = df['wind_turbine_number'].map(
                 self.wind_col_trans).fillna(df['wind_turbine_number'])
 
-            df['time_diff'] = (df['end_time'] - df['begin_time']).dt.total_seconds()
-            df.loc[df['time_diff'] < 0, 'time_diff'] = np.nan
+            if 'end_time' in df.columns:
+                df['time_diff'] = (df['end_time'] - df['begin_time']).dt.total_seconds()
+                df.loc[df['time_diff'] < 0, 'time_diff'] = np.nan
 
             # Sort by begin_time
-            df.sort_values(by=['wind_turbine_number', 'begin_time', 'end_time'], inplace=True)
+            df.sort_values(by=['wind_turbine_number', 'begin_time'], inplace=True)
 
             if self.save_zip:
                 save_path = os.path.join(self.pathsAndTable.get_save_path(), str(self.batch_name) + '.csv.gz')
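
A minimal standalone sketch of the optional-end_time guard above, on invented toy data (pandas/numpy only; not the class itself):

import numpy as np
import pandas as pd

# Toy fault records; some sources ship no end_time column at all.
df = pd.DataFrame({
    'wind_turbine_number': ['A01', 'A01'],
    'begin_time': ['2024-08-01 00:10:00', '2024-08-01 01:00:00'],
})

df['begin_time'] = pd.to_datetime(df['begin_time'], errors='coerce')
if 'end_time' in df.columns:
    df['end_time'] = pd.to_datetime(df['end_time'], errors='coerce')
    df['time_diff'] = (df['end_time'] - df['begin_time']).dt.total_seconds()
    df.loc[df['time_diff'] < 0, 'time_diff'] = np.nan  # negative spans are invalid

# Sorting keys no longer include end_time, so frames without it sort fine.
df.sort_values(by=['wind_turbine_number', 'begin_time'], inplace=True)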

+ 7 - 1
etl/wind_power/min_sec/ClassIdentifier.py

@@ -1,4 +1,5 @@
 import datetime
+import traceback
 
 import numpy as np
 from pandas import DataFrame
@@ -350,6 +351,11 @@ class ClassIdentifier(object):
         # Implement your class identification logic here
         begin = datetime.datetime.now()
         trans_print("打标签开始,风机号:", self.wind_turbine_number, self.df.shape)
-        df = self.identifier()
+        try:
+            df = self.identifier()
+        except Exception as e:
+            trans_print(traceback.format_exc())
+            message = str(e) + ',风机编号:' + self.wind_turbine_number
+            raise Exception('打标签失败:' + message)
         trans_print("打标签结束,", df.shape, ",耗时:", datetime.datetime.now() - begin)
         return df

+ 3 - 1
etl/wind_power/min_sec/StatisticsAndSaveFile.py

@@ -100,7 +100,9 @@ class StatisticsAndSaveFile(object):
         df.sort_values(by='time_stamp', inplace=True)
         df = df[[i for i in self.trans_param.cols_tran.keys() if i in df.columns]]
 
-        # df['active_power'] = df['active_power'] / 1000
+        power = df.sample(int(df.shape[0] / 100))['active_power'].median()
+        if power > 10000:
+            df['active_power'] = df['active_power'] / 1000
         ## Before running data checks, forcibly clean active power first
         df = df[df['active_power'] < 5000]
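
The unit detection above samples roughly 1% of rows and checks the median magnitude. A standalone sketch of the heuristic; the max(..., 1) guard against tiny frames is an added assumption, not part of the commit:

import pandas as pd

def normalize_active_power_to_kw(df: pd.DataFrame) -> pd.DataFrame:
    # Sample ~1% of rows (at least 1); df.sample(0) would give a NaN median.
    n = max(int(df.shape[0] / 100), 1)
    power = df.sample(n)['active_power'].median()
    if power > 10000:  # a median this large suggests W rather than kW
        df['active_power'] = df['active_power'] / 1000
    return df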
 

+ 19 - 0
service/plt_service.py

@@ -118,6 +118,25 @@ def get_data_by_batch_no_and_type(batch_no, transfer_type):
     return data[0]
 
 
+## Used when merging multiple batches
+def get_hebing_data_by_batch_no_and_type(batch_no, transfer_type):
+    query_exec_sql = f"""
+    SELECT
+        t.*,a.field_name,b.batch_name
+    FROM
+        data_transfer t INNER JOIN wind_field a on t.field_code = a.field_code
+        inner join wind_field_batch b on t.batch_code = b.batch_code
+    WHERE
+         t.trans_sys_status = 1 and t.transfer_state = 1 and t.batch_code = '{batch_no}' and t.transfer_type = '{transfer_type}'
+    AND t.transfer_addr != ''
+    """
+
+    data = plt.execute(query_exec_sql)
+    if type(data) == tuple:
+        return None
+    return data[0]
+
+
 def get_all_wind(field_code):
     query_sql = """
     SELECT t.engine_code,t.engine_name,t.rated_capacity,a.rated_cut_out_windspeed 

+ 8 - 8
test_run_local.py

@@ -41,7 +41,7 @@ def run(data: dict = dict(), save_db=False, step=0, end=4):
 
 
 if __name__ == '__main__':
-    env = 'prod'
+    env = 'dev'
     if len(sys.argv) >= 2:
         env = sys.argv[1]
 
@@ -59,14 +59,14 @@ if __name__ == '__main__':
     begin = datetime.datetime.now()
     data = dict()
 
-    data['batch_code'] = 'WOF085500008-WOB000002'
-    data['batch_name'] = 'HY秒级数据1009'
-    data['transfer_type'] = 'second'
-    data['transfer_addr'] = r'/data/download/collection_data/1进行中/红阳风电场-贵州-大唐/收资数据/sec/7-8'
-    data['field_code'] = 'WOF085500008'
-    data['field_name'] = '红阳风电场'
+    data['batch_code'] = 'xinhuashuidian'
+    data['batch_name'] = '新华水电故障'
+    data['transfer_type'] = 'fault'
+    data['transfer_addr'] = r'D:\data\新华水电\收资数据\故障告警\汇能机组数据-故障'
+    data['field_code'] = 'xinhuashuidian'
+    data['field_name'] = '新华水电'
     try:
-        run(data=data, save_db=False, step=3, end=3)
+        run(data=data, save_db=False, step=0, end=3)
     except Exception as e:
         trans_print(traceback.format_exc())
 

+ 48 - 0
tmp_file/changing_hebing_guzhang.py

@@ -0,0 +1,48 @@
+import copy
+import datetime
+
+import pandas as pd
+
+read_path = r'D:\data\长清\故障记录_20230420_20240419.csv'
+
+df = pd.read_csv(read_path, encoding='gb18030')
+
+df['风机名'] = df['风机名'].apply(lambda wind_name: 'A' + wind_name.replace('号风机', '').zfill(2))
+
+df = df[~df['状态码描述'].isin(['高偏航误差穿越', '手动偏航'])]
+
+df['激活时间'] = pd.to_datetime(df['激活时间'].apply(lambda x: x[0:x.rfind(":")]), errors='coerce')
+df['复位时间'] = pd.to_datetime(df['复位时间'].apply(lambda x: x[0:x.rfind(":")]), errors='coerce')
+
+df.dropna(subset=['激活时间', '复位时间'], inplace=True)
+
+
+def generate_next_10_min(dt):
+    minute = dt.minute
+    chazhi = 10 - int(minute % 10)
+    now = dt + datetime.timedelta(minutes=chazhi)
+    now = now.replace(second=0, microsecond=0)
+
+    return now
+
+
+df['begin_time'] = df['激活时间'].apply(generate_next_10_min)
+df['end_time'] = df['复位时间'].apply(generate_next_10_min)
+
+df['chazhi_count'] = ((df['end_time'] - df['begin_time']).dt.seconds) // 600 + 1
+
+result_df = df[df['chazhi_count'] == 1]
+
+datas = [[]]
+for index, row in df[df['chazhi_count'] > 1].iterrows():
+    for i in range(row['chazhi_count']):
+        data = copy.deepcopy(row.values)
+        data[6] = data[6] + datetime.timedelta(minutes=10 * i)
+        datas.append(data)
+
+now_df = pd.DataFrame(datas, columns=df.columns)
+result_df = pd.concat([result_df, now_df])
+
+result_df.reset_index(inplace=True, drop=True)
+result_df.sort_values(by=['风机名', '激活时间', 'begin_time'], inplace=True)
+result_df.to_csv("故障记录.csv", encoding='utf8')
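
generate_next_10_min above ceils a timestamp to the following 10-minute boundary; note that an exact boundary also moves forward by a full step. A quick usage check (assuming the function is in scope):

import datetime

print(generate_next_10_min(datetime.datetime(2024, 8, 1, 0, 3, 17)))  # 2024-08-01 00:10:00
print(generate_next_10_min(datetime.datetime(2024, 8, 1, 0, 10, 0)))  # 2024-08-01 00:20:00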

+ 77 - 0
tmp_file/hebing_muti_batch.py

@@ -0,0 +1,77 @@
+import multiprocessing
+
+import os
+import pandas as pd
+import sys
+
+sys.path.insert(0, os.path.abspath(__file__).split("tmp_file")[0])
+
+
+def hebing_and_save(new_batch_save_path, name, paths):
+    df = pd.DataFrame()
+    for path in paths:
+        now_df = read_file_to_df(path)
+        df = pd.concat([df, now_df])
+
+    df.sort_values(by=['time_stamp'], inplace=True)
+
+    create_file_path(new_batch_save_path)
+    df.to_csv(os.path.join(new_batch_save_path, name), index=False, encoding='utf8')
+
+
+if __name__ == '__main__':
+
+    env = 'prod'
+    if len(sys.argv) >= 2:
+        env = sys.argv[1]
+
+    from utils.conf.read_conf import yaml_conf
+
+    conf_path = os.path.abspath(__file__).split("tmp_file")[0] + f"/conf/etl_config_{env}.yaml"
+    os.environ['ETL_CONF'] = conf_path
+    yaml_config = yaml_conf(conf_path)
+    os.environ['env'] = env
+
+    from utils.file.trans_methods import read_file_to_df, create_file_path
+
+    from etl.wind_power.fault_warn.FaultWarnTrans import FaultWarnTrans
+    from etl.wind_power.min_sec.MinSecTrans import MinSecTrans
+    from service.plt_service import get_hebing_data_by_batch_no_and_type
+
+    save_batch = 'WOF085500008-2-3'
+    save_batch_name = '合并'
+    trans_type = 'second'
+    read_batchs = ['WOF085500008-WOB000003', 'WOF085500008-WOB000002']
+    read_paths = list()
+
+    new_batch_save_path = ''
+
+    for read_data in read_batchs:
+        data = get_hebing_data_by_batch_no_and_type(read_data, trans_type)
+        save_db = True
+
+        exec_process = None
+        if data['transfer_type'] in ['second', 'minute']:
+            exec_process = MinSecTrans(data=data, save_db=save_db)
+
+        if data['transfer_type'] in ['fault', 'warn']:
+            exec_process = FaultWarnTrans(data=data, save_db=save_db)
+
+        if exec_process is None:
+            raise Exception("No exec process")
+
+        read_paths.append(exec_process.pathsAndTable.get_save_path())
+        new_batch_save_path = os.path.join(exec_process.pathsAndTable.save_path, save_batch + "_" + save_batch_name,
+                                           trans_type)
+
+    file_dict = dict()
+
+    for read_path in read_paths:
+        for file in os.listdir(read_path):
+            if file in file_dict:
+                file_dict[file].append(os.path.join(read_path, file))
+            else:
+                file_dict[file] = [os.path.join(read_path, file)]
+
+    with multiprocessing.Pool(len(file_dict.keys())) as pool:
+        pool.starmap(hebing_and_save, [(new_batch_save_path, name, paths) for name, paths in file_dict.items()])

+ 12 - 10
tmp_file/organize_xinhua_files.py

@@ -3,7 +3,9 @@ import datetime
 import os
 import warnings
 
+import numpy as np
 import pandas as pd
+
 warnings.filterwarnings("ignore")
 
 
@@ -50,7 +52,7 @@ def create_file_path(path, is_file_path=False):
 
 
 def boolean_is_check_data(df_cols):
-    fault_list = ['快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机']
+    fault_list = ['快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机', '工作模式', '风机自身故障停机', '限功率运行状态']
 
     df_cols = [str(i).split('_')[-1] for i in df_cols]
     for fault in fault_list:
@@ -98,11 +100,11 @@ def save_to_file(dfs, wind_name, save_path='', param='', is_check=False, all_col
 
     result_data_list.append(loss_cols)
 
-    # for col in set(all_cols):
-    #     if col not in df.columns:
-    #         df[col] = np.nan
+    for col in set(all_cols):
+        if col not in df.columns:
+            df[col] = np.nan
 
-    # df.to_csv(os.path.join(save_path, param, wind_name + '.csv'), encoding='utf8', index=False)
+    df.to_csv(os.path.join(save_path, param, wind_name + '.csv'), encoding='utf8', index=False)
 
 
 if __name__ == '__main__':
@@ -146,11 +148,11 @@ if __name__ == '__main__':
                     else:
                         data_wind_name[wind_name] = [df]
 
-        with multiprocessing.Pool(30) as pool:
-            pool.starmap(save_to_file,
-                         [(dfs, wind_name, save_path, "事件数据", True, check_all_cols, result_data_list) for wind_name, dfs
-                          in
-                          check_wind_name.items()])
+        # with multiprocessing.Pool(30) as pool:
+        #     pool.starmap(combine_df,
+        #                  [(dfs, wind_name, save_path, "事件数据", True, check_all_cols, result_data_list) for wind_name, dfs
+        #                   in
+        #                   check_wind_name.items()])
 
         with multiprocessing.Pool(30) as pool:
             pool.starmap(save_to_file,

+ 206 - 0
tmp_file/organize_xinhua_files_data.py

@@ -0,0 +1,206 @@
+import multiprocessing
+import datetime
+import os
+import warnings
+
+import numpy as np
+import pandas as pd
+
+warnings.filterwarnings("ignore")
+
+
+def __build_directory_dict(directory_dict, path, filter_types=None):
+    # Walk all entries under the directory
+    for item in os.listdir(path):
+        item_path = os.path.join(path, item)
+        if os.path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif os.path.isfile(item_path):
+            if path not in directory_dict:
+                directory_dict[path] = []
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[path].append(item_path)
+
+
+# Read all excel files under a path
+def read_excel_files(read_path):
+    if os.path.isfile(read_path):
+        return [read_path]
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# Create a directory path
+def create_file_path(path, is_file_path=False):
+    """
+    Create a directory path
+    :param path: path of the directory to create
+    :param is_file_path: whether the given path includes a concrete file name
+    """
+    if is_file_path:
+        path = os.path.dirname(path)
+    if not os.path.exists(path):
+        os.makedirs(path, exist_ok=True)
+
+
+def boolean_is_check_data(df_cols, need_valid=True):
+    if not need_valid:
+        return True
+    fault_list = ['快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机', '风机自身故障停机', '限功率运行状态']
+    df_cols = [str(i).split('_')[-1] for i in df_cols]
+    for fault in fault_list:
+        if fault in df_cols:
+            return True
+    return False
+
+
+def read_fle_to_df(file_path):
+    df = pd.read_excel(file_path)
+    wind_name = [i for i in df.columns if i.find('_') > -1][0].split('_')[0]
+    df.columns = [i.split('_')[-1] for i in df.columns]
+    df['wind_name'] = wind_name
+    df['采样时间'] = pd.to_datetime(df['采样时间'])
+    df['采样时间'] = df['采样时间'].dt.ceil('T')
+    return boolean_is_check_data(df.columns, file_path.find('批次') > -1), wind_name, df
+
+
+def read_guzhangbaojing(file_path):
+    try:
+        df = pd.read_excel(file_path)
+        df.rename(columns={'风机名': 'wind_name'}, inplace=True)
+        df['采样时间'] = pd.to_datetime(df['采样时间'])
+        df['采样时间'] = df['采样时间'].dt.ceil('T')
+        df = df[(df['采样时间'] >= '2024-08-01 00:00:00') & (df['采样时间'] < '2024-10-01 00:00:00')]
+        return df
+    except Exception as e:
+        print(file_path, e)
+        raise e
+
+
+def combine_df(dfs, wind_name, save_path=''):
+    print(wind_name)
+    cols = list()
+    col_map = dict()
+    try:
+        df = dfs[0]
+        cols.extend(df.columns)
+        for index, now_df in enumerate(dfs):
+            if index > 0:
+                for col in now_df.columns:
+                    if col in cols and col not in ['采样时间', 'wind_name']:
+                        if col in col_map.keys():
+                            count = col_map[col]
+                            col_map[col] = count + 1
+                        else:
+                            count = 1
+                            col_map[col] = 1
+                        now_df.rename(columns={col: col + '__' + str(count)}, inplace=True)
+                df = pd.merge(df, now_df, on=['采样时间', 'wind_name'], how='outer')
+                cols.extend(now_df.columns)
+    except Exception as e:
+        print(wind_name, e)
+        raise e
+    df.reset_index(inplace=True)
+    df.drop_duplicates(inplace=True, subset=['采样时间', 'wind_name'])
+    if 'index' in df.columns:
+        del df['index']
+    create_file_path(save_path)
+    df.sort_values(by='采样时间', inplace=True)
+    df.set_index(keys=['采样时间', 'wind_name'], inplace=True)
+    return wind_name, df
+
+
+def sae_to_csv(wind_name, df):
+    try:
+        col_tuples = [(col.split('__')[0], col) for col in df.columns if col.find('__') > -1]
+        col_dict = dict()
+        for origin, col in col_tuples:
+            if origin in col_dict.keys():
+                col_dict[origin].add(col)
+            else:
+                col_dict[origin] = {col}
+
+        for origin, cols in col_dict.items():
+            print(wind_name, origin, cols)
+            if pd.api.types.is_numeric_dtype(df[origin]):
+                df[origin] = df[list(cols)].max(axis=1)
+            else:
+                df[origin] = df[list(cols)].apply(lambda x: [i for i in x.values if i][0], axis=1)
+            for col in cols:
+                if col != origin:
+                    del df[col]
+
+        df.to_csv(os.path.join(save_path, wind_name + '.csv'), encoding='utf8')
+
+    except Exception as e:
+        print(wind_name, df.columns)
+        raise e
+
+
+if __name__ == '__main__':
+    begin = datetime.datetime.now()
+
+    base_path = r'/data/download/collection_data/1进行中/新华水电/收资数据/风机SCADA数据'
+
+    dir1 = base_path + r'/data'
+    dir2 = base_path + r'/故障报警/汇能机组数据-故障'
+    dir3 = base_path + r'/故障报警/报警'
+    save_path = r'/data/download/collection_data/1进行中/新华水电/清理数据/合并批次1-2故障报警'
+
+    create_file_path(save_path)
+
+    # result_datas = [
+    #     (r'/data/download/collection_data/1进行中/新华水电/风机SCADA数据',
+    #      r'/data/download/collection_data/1进行中/新华水电/整理数据/批次1-2合并'),
+    # ]
+
+    data_wind_name = dict()
+    files = read_excel_files(dir1)
+    with multiprocessing.Pool(30) as pool:
+        datas = pool.starmap(read_fle_to_df, [(file,) for file in files])
+    for data in datas:
+        check_data, wind_name, df = data[0], data[1], data[2]
+        if wind_name in data_wind_name.keys():
+            data_wind_name[wind_name].append(df)
+        else:
+            data_wind_name[wind_name] = [df]
+
+    with multiprocessing.Pool(30) as pool:
+        data_dfs = pool.starmap(combine_df,
+                                [(dfs, wind_name, save_path) for wind_name, dfs
+                                 in
+                                 data_wind_name.items()])
+
+    result_data_dict = dict()
+    for wind_name, df in data_dfs:
+        result_data_dict[wind_name] = df
+
+    for dir4 in [dir2, dir3]:
+        guzhang_files = read_excel_files(dir4)
+        with multiprocessing.Pool(30) as pool:
+            guzhang_datas = pool.starmap(read_guzhangbaojing, [(file,) for file in guzhang_files])
+        guzhang_df = pd.DataFrame()
+        for df in guzhang_datas:
+            if not df.empty:
+                guzhang_df = pd.concat([guzhang_df, df])
+        wind_names = set(list(guzhang_df['wind_name'].values))
+        for wind_name in wind_names:
+            now_df = guzhang_df[guzhang_df['wind_name'] == wind_name]
+            if wind_name in result_data_dict.keys():
+                now_df.reset_index(inplace=True)
+                now_df.drop_duplicates(inplace=True, subset=['采样时间', 'wind_name'])
+                if 'index' in now_df.columns:
+                    del now_df['index']
+                now_df.sort_values(by='采样时间', inplace=True)
+                now_df.set_index(keys=['采样时间', 'wind_name'], inplace=True)
+                res_df = result_data_dict[wind_name]
+                result_data_dict[wind_name] = pd.concat([res_df, now_df], axis=1)
+
+    with multiprocessing.Pool(30) as pool:
+        pool.starmap(sae_to_csv, [(wind_name, df) for wind_name, df in result_data_dict.items()])
+
+    print(datetime.datetime.now() - begin)
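
combine_df renames clashing columns to col__N during the merge, and sae_to_csv later folds them back into one column. A minimal sketch of that coalescing step on an invented toy frame:

import pandas as pd

df = pd.DataFrame({
    '有功功率': [1.0, None],
    '有功功率__1': [None, 2.0],
})

# Numeric duplicates collapse via a NaN-skipping row-wise max; for
# non-numeric columns sae_to_csv takes the first non-empty value instead.
df['有功功率'] = df[['有功功率', '有功功率__1']].max(axis=1)
df.drop(columns=['有功功率__1'], inplace=True)
print(df)  # a single coalesced 有功功率 column: [1.0, 2.0]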

+ 26 - 0
tmp_file/select_part_cols.py

@@ -0,0 +1,26 @@
+import datetime
+import multiprocessing
+import os
+import pandas as pd
+
+read_dir = r'/data/download/collection_data/2完成/招远风电场-山东-大唐/清理数据/WOF01000010-WOB000002_ZY24年7-9月秒级/second'
+save_dir = r'/data/download/collection_data/2完成/招远风电场-山东-大唐/清理数据/WOF01000010-WOB000002_ZY24年7-9月秒级/second_select_yaw_error1_20241014'
+
+
+def read_and_select_and_save(file):
+    df = pd.read_csv(read_dir + os.sep + file,
+                     usecols=['active_power', 'wind_velocity', 'pitch_angle_blade_1', 'yaw_error1', 'lab'])
+    df = df[df['yaw_error1'] <= 360]
+    df['yaw_error1'] = df['yaw_error1'].apply(lambda x: x - 360 if 180 <= x <= 360 else x)
+    condition = (df['active_power'] > 0) & (df['wind_velocity'] > 0)
+    df = df[condition]
+
+    df.to_csv(os.path.join(save_dir, file), index=False, encoding='utf8')
+    print(f'{file}处理完成')
+
+
+if __name__ == '__main__':
+    begin = datetime.datetime.now()
+    with multiprocessing.Pool(32) as pool:
+        pool.starmap(read_and_select_and_save, [(file,) for file in os.listdir(read_dir)])
+    print(f'总耗时:{datetime.datetime.now() - begin}')

+ 37 - 41
tmp_file/对比文件夹列名差值.py

@@ -7,7 +7,7 @@ from utils.file.trans_methods import *
 
 
 def boolean_is_check_data(df_cols):
-    fault_list = ['快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机', '远方限功率运行状态']
+    fault_list = ['快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机', '工作模式']
 
     df_cols = [str(i).split('_')[-1] for i in df_cols]
     for fault in fault_list:
@@ -17,28 +17,27 @@ def boolean_is_check_data(df_cols):
     return False
 
 
-def compareTwoFolders(df1s, other_dfs):
-    for is_falut in [True, False]:
-        list1 = list()
+def compareTwoFolders(list1, other_dfs):
+    for is_falut in [True]:
         result_df = pd.DataFrame()
-        for df1 in df1s:
-            tmp_list = [str(i).split('_')[-1] for i in list(df1.columns) if i != 'sheet_name']
-            if is_falut:
-                if boolean_is_check_data(df1.columns):
-                    list1.extend(tmp_list)
-            else:
-                if not boolean_is_check_data(df1.columns):
-                    list1.extend(tmp_list)
+        # for df1 in df1s:
+        #     tmp_list = [str(i).split('_')[-1] for i in list(df1.columns) if i != 'sheet_name']
+        #     if is_falut:
+        #         if boolean_is_check_data(df1.columns):
+        #             list1.extend(tmp_list)
+        #     else:
+        #         if not boolean_is_check_data(df1.columns):
+        #             list1.extend(tmp_list)
 
         set1 = set(list1)
 
         list1 = list(set1)
         list1.sort()
-        list1.extend([''] * 20)
 
-        result_df['风机1'] = list1
+        result_df['完整列名'] = list1
 
         for wind_name, dfs in other_dfs.items():
+
             list2 = list()
             for df in dfs:
                 tmp_list = [str(i).split('_')[-1] for i in list(df.columns) if i != 'sheet_name']
@@ -56,51 +55,48 @@ def compareTwoFolders(df1s, other_dfs):
             list3 = list(set1 - set2)
             list3.sort()
 
-            list4 = list(set2 - set1)
-            list4.sort()
-            print(list3)
-            print(list4)
+            # list4 = list(set2 - set1)
+            # list4.sort()
+            # print(list3)
+            # print(list4)
 
             max_count = len(list1)
             list1.extend([''] * (max_count - len(list1)))
             list2.extend([''] * (max_count - len(list2)))
             list3.extend([''] * (max_count - len(list3)))
-            list4.extend([''] * (max_count - len(list4)))
+            # list4.extend([''] * (max_count - len(list4)))
 
-            result_df['风机' + str(wind_name) + '_字段'] = list2
-            result_df['风机' + str(wind_name) + '_比风机1少字段'] = list3
-            result_df['风机' + str(wind_name) + '_比风机1多字段'] = list4
+            result_df[str(wind_name) + '字段'] = list2
+            result_df[str(wind_name) + '比完整列名少字段'] = list3
+            # result_df['风机' + str(wind_name) + '_比风机1多字段'] = list4
 
         file_name = 'col_compare.csv' if not is_falut else 'col_compare_fault.csv'
-        result_df.to_csv(file_name, encoding='utf-8')
+
+        result_df.to_csv(file_name, encoding='utf-8', index=False)
 
 
 if __name__ == '__main__':
     begin = datetime.datetime.now()
-    dir1 = r'D:\data\新华水电\风机SCADA数据\标准'
-    dir2 = r'D:\data\新华水电\风机SCADA数据\9月风机数据'
-    files1 = read_excel_files(dir1)
+    dir2 = r'D:\data\新华水电\风机SCADA数据'
     files2 = read_excel_files(dir2)
-    with multiprocessing.Pool(10) as pool:
-        df1s = pool.starmap(read_file_to_df, [(file, list(), None, 1) for file in files1])
 
     other_dfs = dict()
-    for root, dirs, files in os.walk(dir2):
-        if dirs:
-            for dir in dirs:
-                wind_name = dir.split('#')[0]
-                for file in os.listdir(dir2 + os.sep + dir):
-                    print(dir, file)
-                    df = read_file_to_df(os.path.join(dir2, dir, file), nrows=1)
-                    if wind_name in other_dfs.keys():
-                        other_dfs[wind_name].append(df)
-                    else:
-                        other_dfs[wind_name] = [df]
+    list1 = list()
+    for file in files2:
+        month = os.path.basename(os.path.dirname(os.path.dirname(file)))[0:2]
+        wind_name = month + os.path.basename(os.path.dirname(file)).split('#')[0] + '号风机'
+        df = read_file_to_df(file, nrows=1)
+        if boolean_is_check_data(df.columns):
+            list1.extend([str(i).split('_')[-1] for i in list(df.columns) if i != 'sheet_name'])
+        if wind_name in other_dfs.keys():
+            other_dfs[wind_name].append(df)
+        else:
+            other_dfs[wind_name] = [df]
 
     # with multiprocessing.Pool(10) as pool:
     #     df2s = pool.starmap(read_file_to_df, [(file, list(), None, 1) for file in files2])
     #
-
-    compareTwoFolders(df1s, other_dfs)
+    list1 = [i for i in list(set(list1)) if i != 'sheet_name']
+    compareTwoFolders(list1, other_dfs)
 
     print(datetime.datetime.now() - begin)