# 对比文件夹列名差值.py (3.5 KB) - compare column-name differences between data folders.

import datetime
import multiprocessing
import os

import pandas as pd

from utils.file.trans_methods import *
  5. def boolean_is_check_data(df_cols):
  6. fault_list = ['快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机', '远方限功率运行状态']
  7. df_cols = [str(i).split('_')[-1] for i in df_cols]
  8. for fault in fault_list:
  9. if fault in df_cols:
  10. return True
  11. return False
  12. def compareTwoFolders(df1s, other_dfs):
  13. for is_falut in [True, False]:
  14. list1 = list()
  15. result_df = pd.DataFrame()
  16. for df1 in df1s:
  17. tmp_list = [str(i).split('_')[-1] for i in list(df1.columns) if i != 'sheet_name']
  18. if is_falut:
  19. if boolean_is_check_data(df1.columns):
  20. list1.extend(tmp_list)
  21. else:
  22. if not boolean_is_check_data(df1.columns):
  23. list1.extend(tmp_list)
  24. set1 = set(list1)
  25. list1 = list(set1)
  26. list1.sort()
  27. list1.extend([''] * 20)
  28. result_df['风机1'] = list1
  29. for wind_name, dfs in other_dfs.items():
  30. list2 = list()
  31. for df in dfs:
  32. tmp_list = [str(i).split('_')[-1] for i in list(df.columns) if i != 'sheet_name']
  33. if is_falut:
  34. if boolean_is_check_data(df.columns):
  35. list2.extend(tmp_list)
  36. else:
  37. if not boolean_is_check_data(df.columns):
  38. list2.extend(tmp_list)
  39. set2 = set(list2)
  40. list2 = list(set2)
  41. list2.sort()
  42. list3 = list(set1 - set2)
  43. list3.sort()
  44. list4 = list(set2 - set1)
  45. list4.sort()
  46. print(list3)
  47. print(list4)
  48. max_count = len(list1)
  49. list1.extend([''] * (max_count - len(list1)))
  50. list2.extend([''] * (max_count - len(list2)))
  51. list3.extend([''] * (max_count - len(list3)))
  52. list4.extend([''] * (max_count - len(list4)))
  53. result_df['风机' + str(wind_name) + '_字段'] = list2
  54. result_df['风机' + str(wind_name) + '_比风机1少字段'] = list3
  55. result_df['风机' + str(wind_name) + '_比风机1多字段'] = list4
  56. file_name = 'col_compare.csv' if not is_falut else 'col_compare_fault.csv'
  57. result_df.to_csv(file_name, encoding='utf-8')
  58. if __name__ == '__main__':
  59. begin = datetime.datetime.now()
  60. dir1 = r'D:\data\新华水电\风机SCADA数据\标准'
  61. dir2 = r'D:\data\新华水电\风机SCADA数据\9月风机数据'
  62. files1 = read_excel_files(dir1)
  63. files2 = read_excel_files(dir2)
  64. with multiprocessing.Pool(10) as pool:
  65. df1s = pool.starmap(read_file_to_df, [(file, list(), None, 1) for file in files1])
  66. other_dfs = dict()
  67. for root, dirs, files in os.walk(dir2):
  68. if dirs:
  69. for dir in dirs:
  70. wind_name = dir.split('#')[0]
  71. for file in os.listdir(dir2 + os.sep + dir):
  72. print(dir, file)
  73. df = read_file_to_df(os.path.join(dir2, dir, file), nrows=1)
  74. if wind_name in other_dfs.keys():
  75. other_dfs[wind_name].append(df)
  76. else:
  77. other_dfs[wind_name] = [df]
  78. # with multiprocessing.Pool(10) as pool:
  79. # df2s = pool.starmap(read_file_to_df, [(file, list(), None, 1) for file in files2])
  80. #
  81. compareTwoFolders(df1s, other_dfs)
  82. print(datetime.datetime.now() - begin)