|
@@ -4,20 +4,20 @@
|
|
import datetime
|
|
import datetime
|
|
import multiprocessing
|
|
import multiprocessing
|
|
import tempfile
|
|
import tempfile
|
|
|
|
+import traceback
|
|
|
|
|
|
from etl.base.TranseParam import TranseParam
|
|
from etl.base.TranseParam import TranseParam
|
|
from service.plt_service import get_all_wind, update_trans_status_error, update_trans_status_running, \
|
|
from service.plt_service import get_all_wind, update_trans_status_error, update_trans_status_running, \
|
|
update_trans_status_success
|
|
update_trans_status_success
|
|
-from service.trans_service import creat_table_and_add_partition, rename_table, save_file_to_db
|
|
|
|
|
|
+from service.trans_service import creat_table_and_add_partition, rename_table, save_file_to_db, drop_table
|
|
from utils.file.trans_methods import *
|
|
from utils.file.trans_methods import *
|
|
-from utils.log.trans_log import logger
|
|
|
|
from utils.zip.unzip import unzip, unrar, get_desc_path
|
|
from utils.zip.unzip import unzip, unrar, get_desc_path
|
|
|
|
|
|
|
|
|
|
class WindFarms(object):
|
|
class WindFarms(object):
|
|
|
|
|
|
def __init__(self, batch_no=None, field_code=None, params: TranseParam = None, wind_full_name=None,
|
|
def __init__(self, batch_no=None, field_code=None, params: TranseParam = None, wind_full_name=None,
|
|
- save_db=True):
|
|
|
|
|
|
+ save_db=True, header=0):
|
|
self.batch_no = batch_no
|
|
self.batch_no = batch_no
|
|
self.field_code = field_code
|
|
self.field_code = field_code
|
|
self.wind_full_name = wind_full_name
|
|
self.wind_full_name = wind_full_name
|
|
@@ -30,6 +30,7 @@ class WindFarms(object):
|
|
self.save_db = save_db
|
|
self.save_db = save_db
|
|
self.lock = multiprocessing.Manager().Lock()
|
|
self.lock = multiprocessing.Manager().Lock()
|
|
self.statistics_map = multiprocessing.Manager().dict()
|
|
self.statistics_map = multiprocessing.Manager().dict()
|
|
|
|
+ self.header = header
|
|
|
|
|
|
def set_trans_param(self, params: TranseParam):
|
|
def set_trans_param(self, params: TranseParam):
|
|
self.trans_param = params
|
|
self.trans_param = params
|
|
@@ -81,9 +82,14 @@ class WindFarms(object):
|
|
df.drop(key, axis=1, inplace=True)
|
|
df.drop(key, axis=1, inplace=True)
|
|
|
|
|
|
df = del_blank(df, ['wind_turbine_number'])
|
|
df = del_blank(df, ['wind_turbine_number'])
|
|
|
|
+ df = df[df['time_stamp'].isna() == False]
|
|
|
|
+ if self.trans_param.wind_name_exec:
|
|
|
|
+ exec_str = f"df['wind_turbine_number'].apply(lambda wind_name: {self.trans_param.wind_name_exec} )"
|
|
|
|
+ df['wind_turbine_number'] = eval(exec_str)
|
|
|
|
+
|
|
self.save_to_tmp_csv(df, file)
|
|
self.save_to_tmp_csv(df, file)
|
|
|
|
|
|
- def get_and_remove(self, file):
|
|
|
|
|
|
+ def get_and_remove(self, file, thead_local=None):
|
|
|
|
|
|
to_path = self.get_excel_tmp_path()
|
|
to_path = self.get_excel_tmp_path()
|
|
if str(file).endswith("zip"):
|
|
if str(file).endswith("zip"):
|
|
@@ -94,19 +100,21 @@ class WindFarms(object):
|
|
is_success, e = unzip(file, get_desc_path(desc_path))
|
|
is_success, e = unzip(file, get_desc_path(desc_path))
|
|
self.trans_param.has_zip = True
|
|
self.trans_param.has_zip = True
|
|
if not is_success:
|
|
if not is_success:
|
|
- raise e
|
|
|
|
|
|
+ # raise e
|
|
|
|
+ pass
|
|
elif str(file).endswith("rar"):
|
|
elif str(file).endswith("rar"):
|
|
desc_path = file.replace(self.trans_param.read_path, to_path)
|
|
desc_path = file.replace(self.trans_param.read_path, to_path)
|
|
is_success, e = unrar(file, get_desc_path(desc_path))
|
|
is_success, e = unrar(file, get_desc_path(desc_path))
|
|
self.trans_param.has_zip = True
|
|
self.trans_param.has_zip = True
|
|
if not is_success:
|
|
if not is_success:
|
|
- raise e
|
|
|
|
|
|
+ # raise e
|
|
|
|
+ pass
|
|
else:
|
|
else:
|
|
copy_to_new(file, file.replace(self.trans_param.read_path, to_path))
|
|
copy_to_new(file, file.replace(self.trans_param.read_path, to_path))
|
|
|
|
|
|
- def read_excel_to_df(self, file):
|
|
|
|
|
|
+ def read_excel_to_df(self, file_path):
|
|
|
|
|
|
- read_cols = [v for k, v in self.trans_param.cols_tran.items() if v and not v.startswith("$")]
|
|
|
|
|
|
+ read_cols = [v.split(",")[0] for k, v in self.trans_param.cols_tran.items() if v and not v.startswith("$")]
|
|
|
|
|
|
trans_dict = {}
|
|
trans_dict = {}
|
|
for k, v in self.trans_param.cols_tran.items():
|
|
for k, v in self.trans_param.cols_tran.items():
|
|
@@ -115,11 +123,10 @@ class WindFarms(object):
|
|
|
|
|
|
if self.trans_param.is_vertical_table:
|
|
if self.trans_param.is_vertical_table:
|
|
vertical_cols = self.trans_param.vertical_cols
|
|
vertical_cols = self.trans_param.vertical_cols
|
|
- df = read_file_to_df(file, vertical_cols)
|
|
|
|
|
|
+ df = read_file_to_df(file_path, vertical_cols, header=self.header)
|
|
df = df[df[self.trans_param.vertical_key].isin(read_cols)]
|
|
df = df[df[self.trans_param.vertical_key].isin(read_cols)]
|
|
df.rename(columns={self.trans_param.cols_tran['wind_turbine_number']: 'wind_turbine_number',
|
|
df.rename(columns={self.trans_param.cols_tran['wind_turbine_number']: 'wind_turbine_number',
|
|
self.trans_param.cols_tran['time_stamp']: 'time_stamp'}, inplace=True)
|
|
self.trans_param.cols_tran['time_stamp']: 'time_stamp'}, inplace=True)
|
|
-
|
|
|
|
df[self.trans_param.vertical_key] = df[self.trans_param.vertical_key].map(trans_dict).fillna(
|
|
df[self.trans_param.vertical_key] = df[self.trans_param.vertical_key].map(trans_dict).fillna(
|
|
df[self.trans_param.vertical_key])
|
|
df[self.trans_param.vertical_key])
|
|
|
|
|
|
@@ -128,16 +135,16 @@ class WindFarms(object):
|
|
else:
|
|
else:
|
|
trans_dict = dict()
|
|
trans_dict = dict()
|
|
for k, v in self.trans_param.cols_tran.items():
|
|
for k, v in self.trans_param.cols_tran.items():
|
|
- if v and v.startswith("$"):
|
|
|
|
|
|
+ if v and v.startswith("$") or v.find(",") > 0:
|
|
trans_dict[v] = k
|
|
trans_dict[v] = k
|
|
|
|
|
|
if self.trans_param.merge_columns:
|
|
if self.trans_param.merge_columns:
|
|
- df = read_file_to_df(file)
|
|
|
|
|
|
+ df = read_file_to_df(file_path, header=self.header)
|
|
else:
|
|
else:
|
|
if self.trans_param.need_valid_cols:
|
|
if self.trans_param.need_valid_cols:
|
|
- df = read_file_to_df(file, read_cols)
|
|
|
|
|
|
+ df = read_file_to_df(file_path, read_cols, header=self.header)
|
|
else:
|
|
else:
|
|
- df = read_file_to_df(file)
|
|
|
|
|
|
+ df = read_file_to_df(file_path, header=self.header)
|
|
|
|
|
|
# 处理列名前缀问题
|
|
# 处理列名前缀问题
|
|
if self.trans_param.resolve_col_prefix:
|
|
if self.trans_param.resolve_col_prefix:
|
|
@@ -148,16 +155,23 @@ class WindFarms(object):
|
|
|
|
|
|
for k, v in trans_dict.items():
|
|
for k, v in trans_dict.items():
|
|
if k.startswith("$file"):
|
|
if k.startswith("$file"):
|
|
- file_name = ".".join(os.path.basename(file).split(".")[0:-1])
|
|
|
|
|
|
+ file = ".".join(os.path.basename(file_path).split(".")[0:-1])
|
|
if k == "$file":
|
|
if k == "$file":
|
|
- df[v] = str(file_name)
|
|
|
|
- else:
|
|
|
|
|
|
+ df[v] = str(file)
|
|
|
|
+ elif k.startswith("$file["):
|
|
datas = str(k.replace("$file", "").replace("[", "").replace("]", "")).split(":")
|
|
datas = str(k.replace("$file", "").replace("[", "").replace("]", "")).split(":")
|
|
if len(datas) != 2:
|
|
if len(datas) != 2:
|
|
raise Exception("字段映射出现错误 :" + str(trans_dict))
|
|
raise Exception("字段映射出现错误 :" + str(trans_dict))
|
|
- df[v] = str(file_name[int(datas[0]):int(datas[1])]).strip()
|
|
|
|
|
|
+ df[v] = str(file[int(datas[0]):int(datas[1])]).strip()
|
|
|
|
+ elif k.find("$file_date") > 0:
|
|
|
|
+ datas = str(k.split(",")[1].replace("$file_date", "").replace("[", "").replace("]", "")).split(":")
|
|
|
|
+ if len(datas) != 2:
|
|
|
|
+ raise Exception("字段映射出现错误 :" + str(trans_dict))
|
|
|
|
+ date_str = str(file[int(datas[0]):int(datas[1])]).strip()
|
|
|
|
+ df[v] = df[k.split(",")[0]].apply(lambda x: date_str + " " + str(x))
|
|
|
|
+
|
|
elif k.startswith("$folder"):
|
|
elif k.startswith("$folder"):
|
|
- folder = file
|
|
|
|
|
|
+ folder = file_path
|
|
cengshu = int(str(k.replace("$folder", "").replace("[", "").replace("]", "")))
|
|
cengshu = int(str(k.replace("$folder", "").replace("[", "").replace("]", "")))
|
|
for i in range(cengshu):
|
|
for i in range(cengshu):
|
|
folder = os.path.dirname(folder)
|
|
folder = os.path.dirname(folder)
|
|
@@ -243,8 +257,22 @@ class WindFarms(object):
|
|
|
|
|
|
df = df[self.trans_param.cols_tran.keys()]
|
|
df = df[self.trans_param.cols_tran.keys()]
|
|
|
|
|
|
|
|
+ # 转化风机名称
|
|
|
|
+ trans_print("开始转化风机名称")
|
|
|
|
+ # if self.trans_param.wind_name_exec:
|
|
|
|
+ # exec_str = f"df['wind_turbine_number'].apply(lambda wind_name: {self.trans_param.wind_name_exec} )"
|
|
|
|
+ # df['wind_turbine_number'] = eval(exec_str)
|
|
|
|
+ df['wind_turbine_number'] = df['wind_turbine_number'].astype('str')
|
|
|
|
+ df['wind_turbine_number'] = df['wind_turbine_number'].map(
|
|
|
|
+ self.wind_col_trans).fillna(
|
|
|
|
+ df['wind_turbine_number'])
|
|
|
|
+
|
|
|
|
+ wind_col_name = str(df['wind_turbine_number'].values[0])
|
|
# 添加年月日
|
|
# 添加年月日
|
|
- trans_print("包含时间字段,开始处理时间字段,添加年月日", filename)
|
|
|
|
|
|
+ trans_print(wind_col_name, "包含时间字段,开始处理时间字段,添加年月日", filename)
|
|
|
|
+ trans_print(wind_col_name, "时间原始大小:", df.shape[0])
|
|
|
|
+ df = df[(df['time_stamp'].str.find('-') > 0) & (df['time_stamp'].str.find(':') > 0)]
|
|
|
|
+ trans_print(wind_col_name, "去掉非法时间后大小:", df.shape[0])
|
|
df['time_stamp'] = pd.to_datetime(df['time_stamp'])
|
|
df['time_stamp'] = pd.to_datetime(df['time_stamp'])
|
|
df['year'] = df['time_stamp'].dt.year
|
|
df['year'] = df['time_stamp'].dt.year
|
|
df['month'] = df['time_stamp'].dt.month
|
|
df['month'] = df['time_stamp'].dt.month
|
|
@@ -254,18 +282,13 @@ class WindFarms(object):
|
|
lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
|
|
lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
|
|
trans_print("处理时间字段结束")
|
|
trans_print("处理时间字段结束")
|
|
|
|
|
|
- # 转化风机名称
|
|
|
|
- trans_print("开始转化风机名称")
|
|
|
|
- if self.trans_param.wind_name_exec:
|
|
|
|
- exec_str = f"df['wind_turbine_number'].apply(lambda wind_name: {self.trans_param.wind_name_exec} )"
|
|
|
|
- df['wind_turbine_number'] = eval(exec_str)
|
|
|
|
-
|
|
|
|
- df['wind_turbine_number'] = df['wind_turbine_number'].map(
|
|
|
|
- self.wind_col_trans).fillna(
|
|
|
|
- df['wind_turbine_number'])
|
|
|
|
- trans_print("转化风机名称结束")
|
|
|
|
|
|
+ # 如果包含*号,祛除
|
|
|
|
+ trans_print(wind_col_name, "过滤星号前大小:", df.shape[0])
|
|
|
|
+ mask = ~df.applymap(lambda x: isinstance(x, str) and '*' in x).any(axis=1)
|
|
|
|
+ df = df[mask]
|
|
|
|
+ trans_print(wind_col_name, "过滤星号后大小:", df.shape[0])
|
|
|
|
|
|
- wind_col_name = str(df['wind_turbine_number'].values[0])
|
|
|
|
|
|
+ trans_print(wind_col_name, "转化风机名称结束")
|
|
|
|
|
|
if self.save_zip:
|
|
if self.save_zip:
|
|
save_path = os.path.join(self.get_save_path(), str(wind_col_name) + '.csv.gz')
|
|
save_path = os.path.join(self.get_save_path(), str(wind_col_name) + '.csv.gz')
|
|
@@ -297,14 +320,12 @@ class WindFarms(object):
|
|
|
|
|
|
trans_print('读取文件数量:', len(all_files))
|
|
trans_print('读取文件数量:', len(all_files))
|
|
except Exception as e:
|
|
except Exception as e:
|
|
- logger.exception(e)
|
|
|
|
|
|
+ trans_print(traceback.format_exc())
|
|
message = "读取文件列表错误:" + self.trans_param.read_path + ",系统返回错误:" + str(e)
|
|
message = "读取文件列表错误:" + self.trans_param.read_path + ",系统返回错误:" + str(e)
|
|
- update_trans_status_error(self.batch_no, self.trans_param.read_type, message, self.save_db)
|
|
|
|
- raise e
|
|
|
|
|
|
+ raise ValueError(message)
|
|
return all_files
|
|
return all_files
|
|
|
|
|
|
def read_file_and_save_tmp(self):
|
|
def read_file_and_save_tmp(self):
|
|
-
|
|
|
|
all_files = read_excel_files(self.get_excel_tmp_path())
|
|
all_files = read_excel_files(self.get_excel_tmp_path())
|
|
if self.trans_param.merge_columns:
|
|
if self.trans_param.merge_columns:
|
|
dfs_list = list()
|
|
dfs_list = list()
|
|
@@ -335,25 +356,23 @@ class WindFarms(object):
|
|
try:
|
|
try:
|
|
self.df_save_to_tmp_file(df, "")
|
|
self.df_save_to_tmp_file(df, "")
|
|
except Exception as e:
|
|
except Exception as e:
|
|
- logger.exception(e)
|
|
|
|
|
|
+ trans_print(traceback.format_exc())
|
|
message = "合并列出现错误:" + str(e)
|
|
message = "合并列出现错误:" + str(e)
|
|
- update_trans_status_error(self.batch_no, self.trans_param.read_type, message, self.save_db)
|
|
|
|
- raise e
|
|
|
|
|
|
+ raise ValueError(message)
|
|
|
|
|
|
else:
|
|
else:
|
|
- all_arrays = split_array(all_files, 6)
|
|
|
|
|
|
+ split_count = 6
|
|
|
|
+ all_arrays = split_array(all_files, split_count)
|
|
for arr in all_arrays:
|
|
for arr in all_arrays:
|
|
- with multiprocessing.Pool(6) as pool:
|
|
|
|
|
|
+ with multiprocessing.Pool(split_count) as pool:
|
|
dfs = pool.starmap(self.read_excel_to_df, [(ar,) for ar in arr])
|
|
dfs = pool.starmap(self.read_excel_to_df, [(ar,) for ar in arr])
|
|
try:
|
|
try:
|
|
for df in dfs:
|
|
for df in dfs:
|
|
self.df_save_to_tmp_file(df)
|
|
self.df_save_to_tmp_file(df)
|
|
except Exception as e:
|
|
except Exception as e:
|
|
- logger.exception(e)
|
|
|
|
|
|
+ trans_print(traceback.format_exc())
|
|
message = "整理临时文件,系统返回错误:" + str(e)
|
|
message = "整理临时文件,系统返回错误:" + str(e)
|
|
- update_trans_status_error(self.batch_no, self.trans_param.read_type, message,
|
|
|
|
- self.save_db)
|
|
|
|
- raise e
|
|
|
|
|
|
+ raise ValueError(message)
|
|
|
|
|
|
def mutiprocessing_to_save_file(self):
|
|
def mutiprocessing_to_save_file(self):
|
|
# 开始保存到正式文件
|
|
# 开始保存到正式文件
|
|
@@ -362,12 +381,10 @@ class WindFarms(object):
|
|
try:
|
|
try:
|
|
with multiprocessing.Pool(6) as pool:
|
|
with multiprocessing.Pool(6) as pool:
|
|
pool.starmap(self.save_to_csv, [(file,) for file in all_tmp_files])
|
|
pool.starmap(self.save_to_csv, [(file,) for file in all_tmp_files])
|
|
-
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
- logger.exception(e)
|
|
|
|
|
|
+ trans_print(traceback.format_exc())
|
|
message = "保存文件错误,系统返回错误:" + str(e)
|
|
message = "保存文件错误,系统返回错误:" + str(e)
|
|
- update_trans_status_error(self.batch_no, self.trans_param.read_type, message, self.save_db)
|
|
|
|
- raise e
|
|
|
|
|
|
+ raise ValueError(message)
|
|
|
|
|
|
trans_print("结束保存到excel文件")
|
|
trans_print("结束保存到excel文件")
|
|
|
|
|
|
@@ -384,10 +401,9 @@ class WindFarms(object):
|
|
[(table_name, file, self.batch_count) for file in all_saved_files])
|
|
[(table_name, file, self.batch_count) for file in all_saved_files])
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
- logger.exception(e)
|
|
|
|
|
|
+ trans_print(traceback.format_exc())
|
|
message = "保存到数据库错误,系统返回错误:" + str(e)
|
|
message = "保存到数据库错误,系统返回错误:" + str(e)
|
|
- update_trans_status_error(self.batch_no, self.trans_param.read_type, message, self.save_db)
|
|
|
|
- raise e
|
|
|
|
|
|
+ raise ValueError(message)
|
|
trans_print("结束保存到数据库文件")
|
|
trans_print("结束保存到数据库文件")
|
|
|
|
|
|
def _rename_file(self):
|
|
def _rename_file(self):
|
|
@@ -414,9 +430,11 @@ class WindFarms(object):
|
|
trans_print("删除临时文件夹删除成功")
|
|
trans_print("删除临时文件夹删除成功")
|
|
|
|
|
|
def delete_batch_db(self):
|
|
def delete_batch_db(self):
|
|
- table_name = "_".join([self.batch_no, self.trans_param.read_type])
|
|
|
|
- renamed_table_name = "del_" + table_name + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
|
|
|
|
- rename_table(table_name, renamed_table_name)
|
|
|
|
|
|
+ if self.save_db:
|
|
|
|
+ table_name = "_".join([self.batch_no, self.trans_param.read_type])
|
|
|
|
+ renamed_table_name = "del_" + table_name + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
|
|
|
|
+ # rename_table(table_name, renamed_table_name, self.save_db)
|
|
|
|
+ drop_table(table_name, self.save_db)
|
|
|
|
|
|
def run(self, step=0, end=3):
|
|
def run(self, step=0, end=3):
|
|
begin = datetime.datetime.now()
|
|
begin = datetime.datetime.now()
|
|
@@ -433,13 +451,13 @@ class WindFarms(object):
|
|
self.params_valid([self.batch_no, self.field_code, self.save_path, self.trans_param.read_type,
|
|
self.params_valid([self.batch_no, self.field_code, self.save_path, self.trans_param.read_type,
|
|
self.trans_param.read_path, self.wind_full_name])
|
|
self.trans_param.read_path, self.wind_full_name])
|
|
|
|
|
|
- if self.trans_param.resolve_col_prefix:
|
|
|
|
- column = "测试"
|
|
|
|
- eval(self.trans_param.resolve_col_prefix)
|
|
|
|
-
|
|
|
|
- if self.trans_param.wind_name_exec:
|
|
|
|
- wind_name = "测试"
|
|
|
|
- eval(self.trans_param.wind_name_exec)
|
|
|
|
|
|
+ # if self.trans_param.resolve_col_prefix:
|
|
|
|
+ # column = "测试"
|
|
|
|
+ # eval(self.trans_param.resolve_col_prefix)
|
|
|
|
+ #
|
|
|
|
+ # if self.trans_param.wind_name_exec:
|
|
|
|
+ # wind_name = "测试"
|
|
|
|
+ # eval(self.trans_param.wind_name_exec)
|
|
|
|
|
|
trans_print("初始化字段结束,耗时:", str(datetime.datetime.now() - tmp_begin), ",总耗时:",
|
|
trans_print("初始化字段结束,耗时:", str(datetime.datetime.now() - tmp_begin), ",总耗时:",
|
|
str(datetime.datetime.now() - begin))
|
|
str(datetime.datetime.now() - begin))
|