WindFarms.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. # -*- coding: utf-8 -*-
  2. # @Time : 2024/5/15
  3. # @Author : 魏志亮
  4. import datetime
  5. import multiprocessing
  6. import tempfile
  7. from etl.base.TranseParam import TranseParam
  8. from service.plt_service import get_all_wind, update_trans_status_error, update_trans_status_running, \
  9. update_trans_status_success
  10. from service.trans_service import creat_table_and_add_partition, rename_table, save_df_to_db, save_file_to_db
  11. from utils.file.trans_methods import *
  12. from utils.log.trans_log import logger
  13. from utils.zip.unzip import unzip, unrar
  14. class WindFarms(object):
  15. def __init__(self, name, batch_no=None, field_code=None, params: TranseParam = None, wind_full_name=None,
  16. schedule_exec=True):
  17. self.name = name
  18. self.batch_no = batch_no
  19. self.field_code = field_code
  20. self.wind_full_name = wind_full_name
  21. self.save_zip = False
  22. self.trans_param = params
  23. self.__exist_wind_names = set()
  24. self.wind_col_trans = get_all_wind(self.field_code)
  25. self.batch_count = 50000
  26. self.save_path = None
  27. self.schedule_exec = schedule_exec
  28. self.min_date = None
  29. self.max_date = None
  30. self.total_count = 0
  31. def set_trans_param(self, params: TranseParam):
  32. self.trans_param = params
  33. read_path = str(params.read_path)
  34. if read_path.find(self.wind_full_name) == -1:
  35. message = "读取路径与配置路径不匹配:" + self.trans_param.read_path + ",配置文件为:" + self.wind_full_name
  36. update_trans_status_error(self.batch_no, self.trans_param.read_type, message, self.schedule_exec)
  37. raise ValueError(message)
  38. self.save_path = os.path.join(read_path[0:read_path.find(self.wind_full_name)], self.wind_full_name, "清理数据")
  39. def __params_valid(self, not_null_list=list()):
  40. for arg in not_null_list:
  41. if arg is None or arg == '':
  42. raise Exception("Invalid param set :" + arg)
  43. def __get_save_path(self):
  44. return os.path.join(self.save_path, self.batch_no, self.trans_param.read_type)
  45. def __get_save_tmp_path(self):
  46. return os.path.join(tempfile.gettempdir(), self.wind_full_name, self.batch_no, self.trans_param.read_type)
  47. def __get_excel_tmp_path(self):
  48. return os.path.join(self.__get_save_tmp_path(), 'excel_tmp' + os.sep)
  49. def __get_read_tmp_path(self):
  50. return os.path.join(self.__get_save_tmp_path(), 'read_tmp')
  51. def __df_save_to_tmp_file(self, df=pd.DataFrame(), file=None):
  52. if self.trans_param.is_vertical_table:
  53. pass
  54. else:
  55. # 转换字段
  56. if self.trans_param.cols_tran:
  57. cols_tran = self.trans_param.cols_tran
  58. real_cols_trans = dict()
  59. for k, v in cols_tran.items():
  60. if v and not v.startswith("$"):
  61. real_cols_trans[v] = k
  62. trans_print("包含转换字段,开始处理转换字段")
  63. df.rename(columns=real_cols_trans, inplace=True)
  64. del_keys = set(df.columns) - set(cols_tran.keys())
  65. for key in del_keys:
  66. df.drop(key, axis=1, inplace=True)
  67. df = del_blank(df, ['wind_turbine_number'])
  68. self.__save_to_tmp_csv(df, file)
  69. def __get_excel_files(self):
  70. if os.path.isfile(self.trans_param.read_path):
  71. all_files = [self.trans_param.read_path]
  72. else:
  73. all_files = read_files(self.trans_param.read_path)
  74. to_path = self.__get_excel_tmp_path()
  75. for file in all_files:
  76. if str(file).endswith("zip"):
  77. if str(file).endswith("csv.zip"):
  78. copy_to_new(file, file.replace(self.trans_param.read_path, to_path).replace("csv.zip", 'csv.gz'))
  79. else:
  80. is_success, e = unzip(file, file.replace(self.trans_param.read_path, to_path).split(".")[0])
  81. self.trans_param.has_zip = True
  82. if not is_success:
  83. raise e
  84. elif str(file).endswith("rar"):
  85. is_success, e = unrar(file, file.replace(self.trans_param.read_path, to_path).split(".")[0])
  86. self.trans_param.has_zip = True
  87. if not is_success:
  88. raise e
  89. else:
  90. copy_to_new(file, file.replace(self.trans_param.read_path, to_path))
  91. return read_excel_files(to_path)
  92. def __read_excel_to_df(self, file):
  93. read_cols = [v for k, v in self.trans_param.cols_tran.items() if v and not v.startswith("$")]
  94. trans_dict = {}
  95. for k, v in self.trans_param.cols_tran.items():
  96. if v and not str(v).startswith("$"):
  97. trans_dict[v] = k
  98. if self.trans_param.is_vertical_table:
  99. vertical_cols = self.trans_param.vertical_cols
  100. df = read_file_to_df(file, vertical_cols)
  101. df = df[df[self.trans_param.vertical_key].isin(read_cols)]
  102. df.rename(columns={self.trans_param.cols_tran['wind_turbine_number']: 'wind_turbine_number',
  103. self.trans_param.cols_tran['time_stamp']: 'time_stamp'}, inplace=True)
  104. df[self.trans_param.vertical_key] = df[self.trans_param.vertical_key].map(trans_dict).fillna(
  105. df[self.trans_param.vertical_key])
  106. return df
  107. else:
  108. trans_dict = dict()
  109. for k, v in self.trans_param.cols_tran.items():
  110. if v and v.startswith("$"):
  111. trans_dict[v] = k
  112. if self.trans_param.merge_columns:
  113. df = read_file_to_df(file)
  114. else:
  115. if self.trans_param.need_valid_cols:
  116. df = read_file_to_df(file, read_cols)
  117. else:
  118. df = read_file_to_df(file)
  119. # 处理列名前缀问题
  120. if self.trans_param.resolve_col_prefix:
  121. columns_dict = dict()
  122. for column in df.columns:
  123. columns_dict[column] = eval(self.trans_param.resolve_col_prefix)
  124. df.rename(columns=columns_dict, inplace=True)
  125. for k, v in trans_dict.items():
  126. if k.startswith("$file"):
  127. file_name = ".".join(os.path.basename(file).split(".")[0:-1])
  128. if k == "$file":
  129. df[v] = str(file_name)
  130. else:
  131. datas = str(k.replace("$file", "").replace("[", "").replace("]", "")).split(":")
  132. if len(datas) != 2:
  133. raise Exception("字段映射出现错误 :" + str(trans_dict))
  134. df[v] = str(file_name[int(datas[0]):int(datas[1])]).strip()
  135. elif k.startswith("$folder"):
  136. folder = file
  137. cengshu = int(str(k.replace("$folder", "").replace("[", "").replace("]", "")))
  138. for i in range(cengshu):
  139. folder = os.path.dirname(folder)
  140. df[v] = str(str(folder).split(os.sep)[-1]).strip()
  141. return df
  142. def __save_to_tmp_csv(self, df, file):
  143. trans_print("开始保存", str(file), "到临时文件成功")
  144. names = set(df['wind_turbine_number'].values)
  145. for name in names:
  146. save_name = str(name) + '.csv'
  147. save_path = os.path.join(self.__get_read_tmp_path(), save_name)
  148. create_file_path(save_path, is_file_path=True)
  149. if name in self.__exist_wind_names:
  150. df[df['wind_turbine_number'] == name].to_csv(save_path, index=False, encoding='utf8', mode='a',
  151. header=False)
  152. else:
  153. self.__exist_wind_names.add(name)
  154. df[df['wind_turbine_number'] == name].to_csv(save_path, index=False, encoding='utf8')
  155. self.__set_tongji_data(df)
  156. del df
  157. trans_print("保存", str(names), "到临时文件成功, 风机数量", len(names))
  158. def __set_tongji_data(self, df):
  159. min_date = df['time_stamp'].min()
  160. max_date = df['time_stamp'].max()
  161. if self.min_date is None or self.min_date > min_date:
  162. self.min_date = min_date
  163. if self.max_date is None or self.max_date < max_date:
  164. self.max_date = max_date
  165. self.total_count = self.total_count + df.shape[0]
  166. def save_statistics_file(self):
  167. save_path = os.path.join(os.path.dirname(self.__get_save_path()),
  168. self.trans_param.read_type + '_statistics.txt')
  169. create_file_path(save_path, is_file_path=True)
  170. with open(save_path, 'w', encoding='utf8') as f:
  171. f.write("总数据量:" + str(self.total_count) + "\n")
  172. f.write("最小时间:" + str(self.min_date) + "\n")
  173. f.write("最大时间:" + str(self.max_date) + "\n")
  174. f.write("风机数量:" + str(len(self.__exist_wind_names)) + "\n")
  175. def save_to_csv(self, filename):
  176. df = read_file_to_df(filename)
  177. if self.trans_param.is_vertical_table:
  178. df = df.pivot_table(index=['time_stamp', 'wind_turbine_number'], columns=self.trans_param.vertical_key,
  179. values=self.trans_param.vertical_value,
  180. aggfunc='max')
  181. # 重置索引以得到普通的列
  182. df.reset_index(inplace=True)
  183. for k in self.trans_param.cols_tran.keys():
  184. if k not in df.columns:
  185. df[k] = None
  186. df = df[self.trans_param.cols_tran.keys()]
  187. # 添加年月日
  188. trans_print("包含时间字段,开始处理时间字段,添加年月日", filename)
  189. df['time_stamp'] = pd.to_datetime(df['time_stamp'])
  190. df['year'] = df['time_stamp'].dt.year
  191. df['month'] = df['time_stamp'].dt.month
  192. df['day'] = df['time_stamp'].dt.day
  193. df.sort_values(by='time_stamp', inplace=True)
  194. df['time_stamp'] = df['time_stamp'].apply(
  195. lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
  196. trans_print("处理时间字段结束")
  197. # 转化风机名称
  198. trans_print("开始转化风机名称")
  199. if self.trans_param.wind_name_exec:
  200. exec_str = f"df['wind_turbine_number'].apply(lambda wind_name: {self.trans_param.wind_name_exec} )"
  201. df['wind_turbine_number'] = eval(exec_str)
  202. df['wind_turbine_number'] = df['wind_turbine_number'].map(
  203. self.wind_col_trans).fillna(
  204. df['wind_turbine_number'])
  205. trans_print("转化风机名称结束")
  206. wind_col_name = str(df['wind_turbine_number'].values[0])
  207. if self.save_zip:
  208. save_path = os.path.join(self.__get_save_path(), str(wind_col_name) + '.csv.gz')
  209. else:
  210. save_path = os.path.join(self.__get_save_path(), str(wind_col_name) + '.csv')
  211. create_file_path(save_path, is_file_path=True)
  212. if self.save_zip:
  213. df.to_csv(save_path, compression='gzip', index=False, encoding='utf-8')
  214. else:
  215. df.to_csv(save_path, index=False, encoding='utf-8')
  216. del df
  217. trans_print("保存" + str(filename) + ".csv成功")
  218. def read_all_files(self):
  219. # 读取文件
  220. try:
  221. all_files = self.__get_excel_files()
  222. trans_print('读取文件数量:', len(all_files))
  223. except Exception as e:
  224. logger.exception(e)
  225. message = "读取文件列表错误:" + self.trans_param.read_path + ",系统返回错误:" + str(e)
  226. update_trans_status_error(self.batch_no, self.trans_param.read_type, message, self.schedule_exec)
  227. raise e
  228. return all_files
  229. def __read_file_and_save_tmp(self):
  230. all_files = self.read_all_files()
  231. if self.trans_param.merge_columns:
  232. # with multiprocessing.Pool(6) as pool:
  233. # dfs = pool.starmap(self.__read_excel_to_df, [(file,) for file in all_files])
  234. dfs = list()
  235. index_keys = [self.trans_param.cols_tran['time_stamp']]
  236. wind_col = self.trans_param.cols_tran['wind_turbine_number']
  237. if str(wind_col).startswith("$"):
  238. wind_col = 'wind_turbine_number'
  239. index_keys.append(wind_col)
  240. df_map = dict()
  241. for file in all_files:
  242. df = self.__read_excel_to_df(file)
  243. key = '-'.join(df.columns)
  244. if key in df_map.keys():
  245. df_map[key] = pd.concat([df_map[key], df])
  246. else:
  247. df_map[key] = df
  248. for k, df in df_map.items():
  249. df.drop_duplicates(inplace=True)
  250. df.set_index(keys=index_keys, inplace=True)
  251. df = df[~df.index.duplicated(keep='first')]
  252. dfs.append(df)
  253. df = pd.concat(dfs, axis=1)
  254. df.reset_index(inplace=True)
  255. names = set(df[wind_col].values)
  256. try:
  257. for name in names:
  258. self.__df_save_to_tmp_file(df[df[wind_col] == name], "")
  259. except Exception as e:
  260. logger.exception(e)
  261. message = "合并列出现错误:" + str(e)
  262. update_trans_status_error(self.batch_no, self.trans_param.read_type, message, self.schedule_exec)
  263. raise e
  264. else:
  265. for file in all_files:
  266. try:
  267. self.__df_save_to_tmp_file(self.__read_excel_to_df(file), file)
  268. except Exception as e:
  269. logger.exception(e)
  270. message = "读取文件错误:" + file + ",系统返回错误:" + str(e)
  271. update_trans_status_error(self.batch_no, self.trans_param.read_type, message, self.schedule_exec)
  272. raise e
  273. def mutiprocessing_to_save_file(self):
  274. # 开始保存到正式文件
  275. trans_print("开始保存到excel文件")
  276. all_tmp_files = read_excel_files(self.__get_read_tmp_path())
  277. try:
  278. with multiprocessing.Pool(6) as pool:
  279. pool.starmap(self.save_to_csv, [(file,) for file in all_tmp_files])
  280. except Exception as e:
  281. logger.exception(e)
  282. message = "保存文件错误,系统返回错误:" + str(e)
  283. update_trans_status_error(self.batch_no, self.trans_param.read_type, message, self.schedule_exec)
  284. raise e
  285. trans_print("结束保存到excel文件")
  286. def mutiprocessing_to_save_db(self):
  287. # 开始保存到SQL文件
  288. trans_print("开始保存到数据库文件")
  289. all_saved_files = read_excel_files(self.__get_save_path())
  290. table_name = self.batch_no + "_" + self.trans_param.read_type
  291. creat_table_and_add_partition(table_name, len(all_saved_files), self.trans_param.read_type)
  292. try:
  293. with multiprocessing.Pool(6) as pool:
  294. pool.starmap(save_file_to_db,
  295. [(table_name, file, self.batch_count) for file in all_saved_files])
  296. except Exception as e:
  297. logger.exception(e)
  298. message = "保存到数据库错误,系统返回错误:" + str(e)
  299. update_trans_status_error(self.batch_no, self.trans_param.read_type, message, self.schedule_exec)
  300. raise e
  301. trans_print("结束保存到数据库文件")
  302. def __rename_file(self):
  303. save_path = self.__get_save_path()
  304. files = os.listdir(save_path)
  305. files.sort(key=lambda x: int(str(x).split(os.sep)[-1].split(".")[0][1:]))
  306. for index, file in enumerate(files):
  307. file_path = os.path.join(save_path, 'F' + str(index + 1).zfill(3) + ".csv.gz")
  308. os.rename(os.path.join(save_path, file), file_path)
  309. def delete_batch_files(self):
  310. trans_print("开始删除已存在的批次文件夹")
  311. if os.path.exists(self.__get_save_path()):
  312. shutil.rmtree(self.__get_save_path())
  313. trans_print("删除已存在的批次文件夹")
  314. def delete_tmp_files(self):
  315. trans_print("开始删除临时文件夹")
  316. if os.path.exists(self.__get_excel_tmp_path()):
  317. shutil.rmtree(self.__get_excel_tmp_path())
  318. if os.path.exists(self.__get_read_tmp_path()):
  319. shutil.rmtree(self.__get_read_tmp_path())
  320. if os.path.exists(self.__get_save_tmp_path()):
  321. shutil.rmtree(self.__get_save_tmp_path())
  322. trans_print("删除临时文件夹删除成功")
  323. def delete_batch_db(self):
  324. table_name = "_".join([self.batch_no, self.trans_param.read_type])
  325. renamed_table_name = "del_" + table_name + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
  326. rename_table(table_name, renamed_table_name)
  327. def run(self, step=0, end=3):
  328. begin = datetime.datetime.now()
  329. trans_print("开始执行", self.name, self.trans_param.read_type)
  330. if step <= 0 and end >= 0:
  331. tmp_begin = datetime.datetime.now()
  332. trans_print("开始初始化字段")
  333. self.delete_batch_files()
  334. self.delete_tmp_files()
  335. self.delete_batch_db()
  336. self.__params_valid([self.name, self.batch_no, self.field_code, self.save_path, self.trans_param.read_type,
  337. self.trans_param.read_path, self.wind_full_name])
  338. if self.trans_param.resolve_col_prefix:
  339. column = "测试"
  340. eval(self.trans_param.resolve_col_prefix)
  341. if self.trans_param.wind_name_exec:
  342. wind_name = "测试"
  343. eval(self.trans_param.wind_name_exec)
  344. trans_print("初始化字段结束,耗时:", str(datetime.datetime.now() - tmp_begin), ",总耗时:",
  345. str(datetime.datetime.now() - begin))
  346. if step <= 1 and end >= 1:
  347. # 更新运行状态到运行中
  348. tmp_begin = datetime.datetime.now()
  349. trans_print("开始保存到临时文件")
  350. update_trans_status_running(self.batch_no, self.trans_param.read_type, self.schedule_exec)
  351. # 开始读取数据并分类保存临时文件
  352. self.__read_file_and_save_tmp()
  353. trans_print("保存到临时文件结束,耗时:", str(datetime.datetime.now() - tmp_begin), ",总耗时:",
  354. str(datetime.datetime.now() - begin))
  355. if step <= 2 and end >= 2:
  356. tmp_begin = datetime.datetime.now()
  357. trans_print("开始保存到文件")
  358. self.mutiprocessing_to_save_file()
  359. self.save_statistics_file()
  360. trans_print("保存到文件结束,耗时:", str(datetime.datetime.now() - tmp_begin), ",总耗时:",
  361. str(datetime.datetime.now() - begin))
  362. if step <= 3 and end >= 3:
  363. tmp_begin = datetime.datetime.now()
  364. trans_print("开始保存到数据库")
  365. self.mutiprocessing_to_save_db()
  366. trans_print("保存到数据库结束,耗时:", str(datetime.datetime.now() - tmp_begin), ",总耗时:",
  367. str(datetime.datetime.now() - begin))
  368. # 如果end==0 则说明只是进行了验证
  369. if end != 0:
  370. update_trans_status_success(self.batch_no, self.trans_param.read_type,
  371. len(read_excel_files(self.__get_read_tmp_path())), self.schedule_exec)
  372. trans_print("开始执行", self.name, self.trans_param.read_type, ",,总耗时:",
  373. str(datetime.datetime.now() - begin))
  374. self.delete_tmp_files()