新华水电列名对比.py 3.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. from utils.file.trans_methods import *
  2. from os import path
  3. def boolean_is_check_data(df_cols):
  4. fault_list = ['快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机', '工作模式']
  5. df_cols = [str(i).split('_')[-1] for i in df_cols]
  6. for fault in fault_list:
  7. if fault in df_cols:
  8. return True
  9. return False
  10. def compareTwoFolders(list1, other_dfs):
  11. for is_falut in [True]:
  12. result_df = pd.DataFrame()
  13. # for df1 in df1s:
  14. # tmp_list = [str(i).split('_')[-1] for i in list(df1.columns) if i != 'sheet_name']
  15. # if is_falut:
  16. # if boolean_is_check_data(df1.columns):
  17. # list1.extend(tmp_list)
  18. # else:
  19. # if not boolean_is_check_data(df1.columns):
  20. # list1.extend(tmp_list)
  21. set1 = set(list1)
  22. list1 = list(set1)
  23. list1.sort()
  24. result_df['完整列名'] = list1
  25. for wind_name, dfs in other_dfs.items():
  26. list2 = list()
  27. for df in dfs:
  28. tmp_list = [str(i).split('_')[-1] for i in list(df.columns) if i != 'sheet_name']
  29. if is_falut:
  30. if boolean_is_check_data(df.columns):
  31. list2.extend(tmp_list)
  32. else:
  33. if not boolean_is_check_data(df.columns):
  34. list2.extend(tmp_list)
  35. set2 = set(list2)
  36. list2 = list(set2)
  37. list2.sort()
  38. list3 = list(set1 - set2)
  39. list3.sort()
  40. # list4 = list(set2 - set1)
  41. # list4.sort()
  42. # print(list3)
  43. # print(list4)
  44. max_count = len(list1)
  45. list1.extend([''] * (max_count - len(list1)))
  46. list2.extend([''] * (max_count - len(list2)))
  47. list3.extend([''] * (max_count - len(list3)))
  48. # list4.extend([''] * (max_count - len(list4)))
  49. result_df[str(wind_name) + '字段'] = list2
  50. result_df[str(wind_name) + '比完整列名少字段'] = list3
  51. # result_df['风机' + str(wind_name) + '_比风机1多字段'] = list4
  52. file_name = 'col_compare.csv' if not is_falut else 'col_compare_fault.csv'
  53. result_df.to_csv(file_name, encoding='utf-8', index=False)
  54. if __name__ == '__main__':
  55. begin = datetime.datetime.now()
  56. dir2 = r'D:\data\新华水电\风机SCADA数据'
  57. files2 = read_excel_files(dir2)
  58. other_dfs = dict()
  59. list1 = list()
  60. for file in files2:
  61. month = path.basename(path.dirname(path.dirname(file)))[0:2]
  62. wind_name = month + path.basename(path.dirname(file)).split('#')[0] + '号风机'
  63. df = read_file_to_df(file, nrows=1)
  64. if boolean_is_check_data(df.columns):
  65. list1.extend([str(i).split('_')[-1] for i in list(df.columns) if i != 'sheet_name'])
  66. if wind_name in other_dfs.keys():
  67. other_dfs[wind_name].append(df)
  68. else:
  69. other_dfs[wind_name] = [df]
  70. # with multiprocessing.Pool(10) as pool:
  71. # df2s = pool.starmap(read_file_to_df, [(file, list(), None, 1) for file in files2])
  72. #
  73. list1 = [i for i in list(set(list1)) if i != 'sheet_name']
  74. compareTwoFolders(list1, other_dfs)
  75. print(datetime.datetime.now() - begin)