@@ -1,12 +1,17 @@
-import datetime
+import shutil
+from datetime import datetime
 import json
+import multiprocessing
 import os.path
 
+import pandas as pd
+
+from service.import_data_service import get_exec_group, run_success, run_begin
 from trans.ExecParam import ExecParam
-from service.import_data_service import get_exec_group, get_exec_data
 from trans.common.ReadAndSaveTmp import ReadAndSaveTmp
 from trans.common.UnzipAndRemove import UnzipAndRemove
 from utils.conf.read_conf import read_conf
+from utils.file.trans_methods import read_excel_files
 from utils.log.import_data_log import log_print
 
 
@@ -23,40 +28,128 @@ class ImportParam(object):
             result = results[0]
             self.process_group_name = result['name']
             self.join_type = result['join_type']
+            self.join_type_strs = [int(i) for i in result['join_type_str'].split(",")] if result[
+                'join_type_str'] else []
             self.process_group_id = data['process_group_id']
             self.tmp_root = os.path.join(self.tmp_root, f'{self.executor_history_id}_{self.executor_history_name}')
             self.out_put_dir = data['out_put_dir']
             self.execotor_list = list()
             for exec_data in json.loads(data['input_dirs']):
-                self.execotor_list.append(ExecParam(exec_data, self.tmp_root))
+                self.execotor_list.append(ExecParam(exec_data, self.tmp_root, ))
         else:
             log_print(f"通过ID: {data['id']}没有获取到执行器组")
             raise Exception(f"通过ID: {data['id']}没有获取到执行器组")
 
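+    # Merge the per-executor temp files that share one base name into a single output CSV.
+    # A join type of 0 or 1 is used directly as the concat axis; any other value falls back to join_type_strs.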
+    def save_result_dir(self, base_name, file_dicts, index_cols, axis=0, join_type_strs=[]):
+        log_print(f"开始执行执行器合并:{base_name}")
+        begin = datetime.now()
+
+        df = pd.DataFrame()
+        if axis in [0, 1]:
+            if index_cols:
+                dfs = [pd.read_csv(file, encoding='utf-8', index_col=index_cols) for index, file in file_dicts.items()]
+            else:
+                dfs = [pd.read_csv(file, encoding='utf-8') for index, file in file_dicts.items()]
+
+            df = pd.concat(dfs, axis=axis)
+        else:
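+            # join_type is neither 0 nor 1: use the per-executor join types parsed into join_type_strs.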
+            if len(set(join_type_strs)) == 1:
+                if index_cols:
+                    dfs = [pd.read_csv(file, encoding='utf-8', index_col=index_cols) for index, file in
+                           file_dicts.items()]
+                else:
+                    dfs = [pd.read_csv(file, encoding='utf-8') for index, file in file_dicts.items()]
+
+                df = pd.concat(dfs, axis=join_type_strs[0])
+            else:
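+                # The join types differ: merge neighbouring executor outputs pairwise, each pair along its own axis.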
+                if index_cols:
+                    for join_type_index, join_type in enumerate(join_type_strs):
+                        df1 = pd.DataFrame()
+                        df2 = pd.DataFrame()
+                        if join_type_index in file_dicts.keys():
+                            df1 = pd.read_csv(file_dicts[join_type_index], encoding='utf-8', index_col=index_cols)
+
+                        if join_type_index + 1 in file_dicts.keys():
+                            df2 = pd.read_csv(file_dicts[join_type_index + 1], encoding='utf-8', index_col=index_cols)
+                        df = pd.concat([df, df1, df2], axis=join_type)
+                else:
+                    for join_type_index, join_type in enumerate(join_type_strs):
+                        df1 = pd.DataFrame()
+                        df2 = pd.DataFrame()
+                        if join_type_index in file_dicts.keys():
+                            df1 = pd.read_csv(file_dicts[join_type_index], encoding='utf-8')
+                        if join_type_index + 1 in file_dicts.keys():
+                            df2 = pd.read_csv(file_dicts[join_type_index + 1], encoding='utf-8')
+                        df = pd.concat([df, df1, df2], axis=join_type)
+
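+        # With index columns configured, drop duplicate keys and sort before writing the merged file.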
+        if index_cols:
+            df.reset_index(inplace=True)
+            df.drop_duplicates(subset=index_cols, inplace=True)
+            df.sort_values(by=index_cols, inplace=True)
+
+        df.to_csv(os.path.join(self.out_put_dir, f'{base_name}'), encoding='utf-8', index=False)
+        log_print(f"执行器合并结束:{base_name},耗时:", datetime.now() - begin)
+
     def run(self):
-        total_begin = datetime.datetime.now()
+        total_begin = datetime.now()
         log_print(f'开始执行{self.process_group_name}')
+        log_print(f'开始清理临时目录{self.tmp_root}')
+        shutil.rmtree(self.tmp_root, ignore_errors=True)
+        log_print(f'清理临时目录{self.tmp_root}成功')
+
+        run_begin(self.executor_history_id, self.save_db)
 
         process_count = len(self.execotor_list)
         for index, exec_data in enumerate(self.execotor_list):
             log_print(f'开始执行{exec_data.process_executor}')
 
-            begin = datetime.datetime.now()
+            begin = datetime.now()
             # 移动解压文件
             unzip_and_remove = UnzipAndRemove(self.executor_history_id, process_count, index,
                                               exec_data.path_param, self.save_db)
             unzip_and_remove.run()
-            log_print("解压移动文件结束:耗时:", datetime.datetime.now() - begin, "总耗时:",
-                      datetime.datetime.now() - total_begin)
+            log_print("解压移动文件结束:耗时:", datetime.now() - begin, "总耗时:", datetime.now() - total_begin)
             # 读取文件
 
-            begin = datetime.datetime.now()
+            begin = datetime.now()
 
-            read_and_save = ReadAndSaveTmp(self.executor_history_id, process_count, index,
-                                           exec_data,
-                                           self.out_put_dir if process_count == 1 else exec_data.path_param.get_process_tmp_path(),
-                                           self.save_db)
+            read_and_save = ReadAndSaveTmp(self.executor_history_id, process_count, index, exec_data, self.save_db)
 
             read_and_save.run()
-            log_print("保存数据到临时文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                      datetime.datetime.now() - total_begin)
+            log_print(f"{exec_data.process_executor}保存数据到临时文件结束,耗时:", datetime.now() - begin,
+                      "总耗时:", datetime.now() - total_begin)
+
+        log_print(f"{process_count}个执行器全部保存数据到临时文件结束,总耗时", datetime.now() - total_begin)
+
+        log_print("开始执行执行器合并")
+        begin = datetime.now()
+
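+        # A plain join_type of 0 or 1 applies to every executor; otherwise keep the list parsed from join_type_str.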
+        if self.join_type in [0, 1]:
+            self.join_type_strs = [self.join_type] * len(self.execotor_list)
+
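+        # Group temp files by base name: file_map[base_name][executor_index] -> file path.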
+        file_map = dict()
+        for index, exec_data in enumerate(self.execotor_list):
+            all_files = read_excel_files(exec_data.path_param.get_process_tmp_path())
+            for file in all_files:
+                base_name = os.path.basename(file)
+                if base_name in file_map:
+                    file_map[base_name][index] = file
+                else:
+                    file_map[base_name] = {index: file}
+
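+        # Merge each group of same-named files in parallel across 6 worker processes.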
+        with multiprocessing.Pool(6) as pool:
+            pool.starmap(self.save_result_dir,
+                         [(base_name, file_dicts, self.execotor_list[0].index_cols, self.join_type,
+                           self.join_type_strs) for base_name, file_dicts in file_map.items()])
+
+        log_print("合并结束,耗时:", datetime.now() - begin, ",总耗时:", datetime.now() - total_begin)
+        run_success(self.executor_history_id)
+        log_print(f"{process_count}个执行器全部合并结束,总耗时", datetime.now() - total_begin)