# zhzn / energy-data-trans
							from utils.file.trans_methods import *
from os import path

def boolean_is_check_data(df_cols):
    """Return True when any column name ends in a known fault-related field.

    Each entry of ``df_cols`` is converted to ``str`` and reduced to the text
    after its last underscore; the result is True if at least one of those
    suffixes exactly equals one of the fault field names listed below.
    """
    fault_names = {'快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机', '工作模式'}

    # Only the segment after the final '_' takes part in the comparison.
    suffixes = {str(col).rsplit('_', 1)[-1] for col in df_cols}
    return not fault_names.isdisjoint(suffixes)


def compareTwoFolders(list1, other_dfs, is_fault=True):
    """Write a CSV comparing each turbine's column names with a reference list.

    The output has one column holding the sorted, de-duplicated reference
    names (``完整列名``) and, for every key of ``other_dfs``, two more columns:
    the sorted names found for that key and the reference names it lacks.
    Shorter columns are padded with empty strings.

    Args:
        list1: Iterable of reference column names. It is not modified.
        other_dfs: Mapping of a turbine label to a list of DataFrames. Only
            the frames' ``columns`` are read; each name is reduced to the
            text after its last underscore and ``sheet_name`` is ignored.
        is_fault: When true (the default, matching the previous hard-coded
            behaviour) only frames for which ``boolean_is_check_data`` is
            true are considered and the result goes to
            ``col_compare_fault.csv``; otherwise only the remaining frames
            are considered and the result goes to ``col_compare.csv``.

    Returns:
        None. The result is written to the current working directory.
    """
    want_fault = bool(is_fault)
    reference_set = set(list1)

    # Collect every output column first so that all of them can be padded to
    # one common length before the frame is built.
    columns = {'完整列名': sorted(reference_set)}

    for wind_name, dfs in other_dfs.items():
        turbine_set = set()
        for df in dfs:
            # Skip frames whose fault-column status does not match the mode.
            if boolean_is_check_data(df.columns) != want_fault:
                continue
            turbine_set.update(
                str(col).split('_')[-1] for col in df.columns if col != 'sheet_name'
            )

        label = str(wind_name)
        columns[label + '字段'] = sorted(turbine_set)
        columns[label + '比完整列名少字段'] = sorted(reference_set - turbine_set)

    # Pad to the longest column rather than to the reference length: a
    # turbine may carry more distinct names than the reference list, which
    # previously made the column assignment fail with a length mismatch.
    max_count = max(len(values) for values in columns.values())
    padded = {
        name: values + [''] * (max_count - len(values))
        for name, values in columns.items()
    }

    # Building the frame in one go avoids repeated single-column inserts.
    result_df = pd.DataFrame(padded)

    file_name = 'col_compare_fault.csv' if want_fault else 'col_compare.csv'
    result_df.to_csv(file_name, encoding='utf-8', index=False)


if __name__ == '__main__':
    # Script entry: read the header row of every SCADA file, group the frames
    # per turbine, and write the column comparison CSV. Elapsed time is printed.
    started_at = datetime.datetime.now()
    scada_root = r'D:\data\新华水电\风机SCADA数据'

    frames_by_turbine = {}
    fault_columns = []
    for file_path in read_excel_files(scada_root):
        turbine_dir = path.dirname(file_path)
        # Label = first two characters of the grandparent folder name, then the
        # part of the parent folder name before '#', then a fixed suffix.
        month_prefix = path.basename(path.dirname(turbine_dir))[:2]
        turbine_label = month_prefix + path.basename(turbine_dir).split('#')[0] + '号风机'

        # Only one row is requested; the comparison uses column names only.
        header_df = read_file_to_df(file_path, nrows=1)
        if boolean_is_check_data(header_df.columns):
            fault_columns += [
                str(col).split('_')[-1] for col in header_df.columns if col != 'sheet_name'
            ]
        frames_by_turbine.setdefault(turbine_label, []).append(header_df)

    # De-duplicate the reference names collected from frames with fault columns.
    unique_columns = [name for name in set(fault_columns) if name != 'sheet_name']
    compareTwoFolders(unique_columns, frames_by_turbine)

    print(datetime.datetime.now() - started_at)