Browse Source

优化项目

wzl 1 month ago
parent
commit
038509a11a
45 changed files with 1720 additions and 1613 deletions
  1. 9 9
      app_run.py
  2. 97 0
      conf/constants.py
  3. 10 10
      conf/etl_config_dev.yaml
  4. 3 3
      etl/common/ArchiveFile.py
  5. 39 27
      etl/common/BaseDataTrans.py
  6. 3 3
      etl/common/ClearData.py
  7. 117 44
      etl/common/CombineAndSaveFormalFile.py
  8. 96 20
      etl/common/PathsAndTable.py
  9. 47 22
      etl/common/SaveToDb.py
  10. 43 18
      etl/common/UnzipAndRemove.py
  11. 3 3
      etl/wind_power/fault_warn/FaultWarnTrans.py
  12. 4 4
      etl/wind_power/laser/LaserTrans.py
  13. 6 6
      etl/wind_power/min_sec/ClassIdentifier.py
  14. 51 26
      etl/wind_power/min_sec/MinSecTrans.py
  15. 111 45
      etl/wind_power/min_sec/ReadAndSaveTmp.py
  16. 53 40
      etl/wind_power/min_sec/StatisticsAndSaveTmpFormalFile.py
  17. 42 7
      etl/wind_power/min_sec/TransParam.py
  18. 53 18
      etl/wind_power/wave/WaveTrans.py
  19. 3 3
      service/common_connect.py
  20. 6 2
      service/trans_conf_service.py
  21. 95 72
      service/trans_service.py
  22. 4 2
      utils/common.py
  23. 137 12
      utils/conf/read_conf.py
  24. 231 41
      utils/db/ConnectMysql.py
  25. 6 6
      utils/db/ConnectMysql_tidb_fix.py
  26. 2 3
      utils/df_utils/util.py
  27. 139 38
      utils/file/trans_methods.py
  28. 0 202
      utils/file/trans_methods.py_1
  29. 99 25
      utils/log/trans_log.py
  30. 113 26
      utils/systeminfo/sysinfo.py
  31. 0 0
      utils/tmp_util/__init__.py
  32. 0 37
      utils/tmp_util/合并文件.py
  33. 0 100
      utils/tmp_util/整理INSERT到批量INSERT.py
  34. 0 87
      utils/tmp_util/神木_完整度_10分.py
  35. 0 90
      utils/tmp_util/神木_完整度_1分.py
  36. 0 18
      utils/tmp_util/获取台账所有wind表信息.py
  37. 0 76
      utils/tmp_util/表添加列.py
  38. 0 49
      utils/tmp_util/表添加注释.py
  39. 0 27
      utils/tmp_util/颗粒度变大.py
  40. 98 53
      utils/zip/unzip.py
  41. 0 0
      wind_farm/CGN/__init__.py
  42. 0 83
      wind_farm/CGN/minute_data.py
  43. 0 83
      wind_farm/CGN/purge_history_data.py
  44. 0 173
      wind_farm/CGN/second_data.py
  45. 0 0
      wind_farm/__init__.py

+ 9 - 9
app_run.py

@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 # @Time    : 2024/6/11
 # @Time    : 2024/6/11
 # @Author  : 魏志亮
 # @Author  : 魏志亮
+import os
 import sys
 import sys
-from os import *
 
 
 from utils.conf.read_conf import yaml_conf, read_conf
 from utils.conf.read_conf import yaml_conf, read_conf
 
 
@@ -11,7 +11,7 @@ def get_exec_data(run_count=1):
     now_run_count = get_now_running_count()
     now_run_count = get_now_running_count()
     data = None
     data = None
     if now_run_count >= run_count:
     if now_run_count >= run_count:
-        trans_print(f"当前有{now_run_count}个任务在执行")
+        info(f"当前有{now_run_count}个任务在执行")
     else:
     else:
         data = get_batch_exec_data()
         data = get_batch_exec_data()
     return data
     return data
@@ -22,7 +22,7 @@ def run(save_db=True, run_count=1, yaml_config=None, step=0, end=999):
     data = get_exec_data(run_count)
     data = get_exec_data(run_count)
 
 
     if data is None:
     if data is None:
-        trans_print("没有需要执行的任务")
+        info("没有需要执行的任务")
         return
         return
 
 
     exec_process = None
     exec_process = None
@@ -55,14 +55,14 @@ if __name__ == "__main__":
     if env.endswith(".yaml"):
     if env.endswith(".yaml"):
         conf_path = env
         conf_path = env
     else:
     else:
-        conf_path = path.abspath(f"./conf/etl_config_{env}.yaml")
+        conf_path = os.path.abspath(f"./conf/etl_config_{env}.yaml")
 
 
-    environ["ETL_CONF"] = conf_path
+    os.environ["ETL_CONF"] = conf_path
     yaml_config = yaml_conf(conf_path)
     yaml_config = yaml_conf(conf_path)
-    environ["env"] = env
+    os.environ["env"] = env
     run_count = int(read_conf(yaml_config, "run_batch_count", 1))
     run_count = int(read_conf(yaml_config, "run_batch_count", 1))
 
 
-    from utils.log.trans_log import trans_print
+    from utils.log.trans_log import info
     from service.trans_conf_service import (
     from service.trans_conf_service import (
         update_timeout_trans_data,
         update_timeout_trans_data,
         get_now_running_count,
         get_now_running_count,
@@ -73,7 +73,7 @@ if __name__ == "__main__":
     from etl.wind_power.laser.LaserTrans import LaserTrans
     from etl.wind_power.laser.LaserTrans import LaserTrans
     from etl.wind_power.wave.WaveTrans import WaveTrans
     from etl.wind_power.wave.WaveTrans import WaveTrans
 
 
-    trans_print("所有请求参数:", sys.argv, "env:", env, "最大可执行个数:", run_count)
-    trans_print("配置文件路径:", environ.get("ETL_CONF"))
+    info("所有请求参数:", sys.argv, "env:", env, "最大可执行个数:", run_count)
+    info("配置文件路径:", os.environ.get("ETL_CONF"))
 
 
     run(run_count=run_count, yaml_config=yaml_config, step=0)
     run(run_count=run_count, yaml_config=yaml_config, step=0)

+ 97 - 0
conf/constants.py

@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2026/3/12
+# @Author  : 系统自动生成
+# 项目常量定义
+
+
+# 文件类型常量
+class FileTypes:
+    """文件类型常量"""
+    # Excel相关文件类型
+    EXCEL_TYPES = ['xls', 'xlsx', 'xlsm', 'xlsb', 'odf', 'ods', 'csv', 'csv.gz']
+    # 压缩文件类型
+    ZIP_TYPES = ['rar', 'zip']
+
+
+# 数据处理常量
+class DataProcessing:
+    """数据处理常量"""
+    # 时间戳列名
+    TIME_STAMP_COLUMN = 'time_stamp'
+    # NaN替换值
+    NAN_REPLACE_VALUE = -999999999
+    # 有功功率单位判断阈值
+    POWER_UNIT_THRESHOLD = 100000
+    # 时间间隔
+    TIME_INTERVAL = '10T'
+    # 非数值列
+    NOT_DOUBLE_COLS = ['wind_turbine_number', 'wind_turbine_name', 'time_stamp', 
+                       'param6', 'param7', 'param8', 'param9', 'param10']
+
+
+# 并行处理常量
+class ParallelProcessing:
+    """并行处理常量"""
+    # 最大进程数
+    MAX_PROCESSES = 8
+    # 最大批次数
+    MAX_BATCHES = 10
+    # CPU使用百分比
+    CPU_USAGE_PERCENT = 2 / 3
+
+
+# 数据库常量
+class Database:
+    """数据库常量"""
+    # 表引擎
+    TABLE_ENGINE = 'InnoDB'
+    # 默认字符集
+    DEFAULT_CHARSET = 'utf8mb4'
+    # 批处理大小
+    BATCH_SIZE = 100000
+
+
+# 日志常量
+class Log:
+    """日志常量"""
+    # 默认日志路径
+    DEFAULT_LOG_PATH = "/data/logs"
+    # 日志文件名前缀
+    LOG_FILE_PREFIX = "etl_tools_"
+
+
+# 路径常量
+class Paths:
+    """路径常量"""
+    # 临时文件基础路径
+    DEFAULT_TMP_BASE_PATH = "/tmp"
+    # 归档路径
+    DEFAULT_ARCHIVE_PATH = "/tmp/archive"
+
+
+# 状态常量
+class Status:
+    """状态常量"""
+    # 成功状态
+    SUCCESS = 1
+    # 错误状态
+    ERROR = 0
+    # 运行状态
+    RUNNING = 2
+
+
+# 类型常量
+class Types:
+    """类型常量"""
+    # 秒级数据
+    SECOND = 'second'
+    # 分钟级数据
+    MINUTE = 'minute'
+    # 故障数据
+    FAULT = 'fault'
+    # 告警数据
+    WARN = 'warn'
+    # 波形数据
+    WAVE = 'wave'
+    # 激光数据
+    LASER = 'laser'

+ 10 - 10
conf/etl_config_dev.yaml

@@ -1,24 +1,24 @@
 plt:
 plt:
-  database: energy_ty
+  database: energy
   host: 192.168.50.233
   host: 192.168.50.233
   password: admin123456
   password: admin123456
   port: 3306
   port: 3306
   user: admin
   user: admin
 
 
-# trans:
-#   database: energy_data
-#   host: 192.168.50.235
-#   password: admin123456
-#   port: 30306
-#   user: root
-
 trans:
 trans:
   database: energy_data
   database: energy_data
-  host: 106.120.102.238
+  host: 192.168.50.235
   password: admin123456
   password: admin123456
-  port: 10336
+  port: 30306
   user: root
   user: root
 
 
+#trans:
+#  database: energy_data
+#  host: 106.120.102.238
+#  password: admin123456
+#  port: 10336
+#  user: root
+
 # 如果要放在原始路径,则配置这个 以下面的名称作为切割点,新建清理数据文件夹
 # 如果要放在原始路径,则配置这个 以下面的名称作为切割点,新建清理数据文件夹
 etl_origin_path_contain: 收资数据
 etl_origin_path_contain: 收资数据
 # 如果单独保存,配置这个路径
 # 如果单独保存,配置这个路径

+ 3 - 3
etl/common/ArchiveFile.py

@@ -3,7 +3,7 @@ import shutil
 
 
 from etl.common.PathsAndTable import PathsAndTable
 from etl.common.PathsAndTable import PathsAndTable
 from service.trans_conf_service import update_archive_success
 from service.trans_conf_service import update_archive_success
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info
 
 
 
 
 class ArchiveFile(object):
 class ArchiveFile(object):
@@ -19,6 +19,6 @@ class ArchiveFile(object):
         if os.path.exists(self.pathsAndTable.get_tmp_formal_path()):
         if os.path.exists(self.pathsAndTable.get_tmp_formal_path()):
             shutil.make_archive(self.pathsAndTable.get_archive_path(), 'zip', self.pathsAndTable.get_tmp_formal_path())
             shutil.make_archive(self.pathsAndTable.get_archive_path(), 'zip', self.pathsAndTable.get_tmp_formal_path())
             update_archive_success(self.exec_id, f"{self.pathsAndTable.get_archive_path()}.zip")
             update_archive_success(self.exec_id, f"{self.pathsAndTable.get_archive_path()}.zip")
-            trans_print(f"文件夹已归档为 {self.pathsAndTable.get_archive_path()}.zip")
+            info(f"文件夹已归档为 {self.pathsAndTable.get_archive_path()}.zip")
         else:
         else:
-            trans_print(f"文件夹 {self.pathsAndTable.get_tmp_formal_path()} 不存在")
+            info(f"文件夹 {self.pathsAndTable.get_tmp_formal_path()} 不存在")

+ 39 - 27
etl/common/BaseDataTrans.py

@@ -10,12 +10,24 @@ from service.plt_service import get_all_wind
 from service.trans_conf_service import update_trans_status_success, update_trans_status_error, \
 from service.trans_conf_service import update_trans_status_success, update_trans_status_error, \
     update_trans_status_running
     update_trans_status_running
 from utils.file.trans_methods import read_excel_files
 from utils.file.trans_methods import read_excel_files
-from utils.log.trans_log import trans_print, set_trance_id
+from utils.log.trans_log import set_trance_id, info, error
 
 
 
 
 class BaseDataTrans(object):
 class BaseDataTrans(object):
-    def __init__(self, data: dict = None, save_db=True, yaml_config=None, step=0, end=999):
-
+    """数据转换基类"""
+
+    def __init__(self, data: dict = None, save_db: bool = True, yaml_config: dict = None, step: int = 0,
+                 end: int = 999):
+        """
+        初始化数据转换基类
+        
+        Args:
+            data: 任务数据字典
+            save_db: 是否保存到数据库
+            yaml_config: YAML配置
+            step: 开始步骤
+            end: 结束步骤
+        """
         self.id = data['id']
         self.id = data['id']
         self.task_name = data['task_name']
         self.task_name = data['task_name']
         self.transfer_type = data['transfer_type']
         self.transfer_type = data['transfer_type']
@@ -37,7 +49,7 @@ class BaseDataTrans(object):
                                                self.wind_farm_name, self.transfer_type, save_db, self.save_zip,
                                                self.wind_farm_name, self.transfer_type, save_db, self.save_zip,
                                                self.yaml_config, self.wind_col_trans)
                                                self.yaml_config, self.wind_col_trans)
         except Exception as e:
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             update_trans_status_error(self.id, str(e), self.save_db)
             update_trans_status_error(self.id, str(e), self.save_db)
             raise e
             raise e
 
 
@@ -94,70 +106,70 @@ class BaseDataTrans(object):
             # 0
             # 0
             if self.step <= now_index <= self.end:
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
                 begin = datetime.datetime.now()
-                trans_print("开始清理数据,临时文件夹:", self.pathsAndTable.get_tmp_path())
+                info("开始清理数据,临时文件夹:", self.pathsAndTable.get_tmp_path())
                 self.clean_file_and_db()
                 self.clean_file_and_db()
-                trans_print("清理数据结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("清理数据结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
 
             now_index = now_index + 1
             now_index = now_index + 1
             # 1
             # 1
             if self.step <= now_index <= self.end:
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
                 begin = datetime.datetime.now()
-                trans_print("开始解压移动文件")
+                info("开始解压移动文件")
                 self.unzip_or_remove_to_tmp_dir()
                 self.unzip_or_remove_to_tmp_dir()
-                trans_print("解压移动文件结束:耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("解压移动文件结束:耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
 
             now_index = now_index + 1
             now_index = now_index + 1
             # 2
             # 2
             if self.step <= now_index <= self.end:
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
                 begin = datetime.datetime.now()
-                trans_print("开始保存数据到临时文件")
+                info("开始保存数据到临时文件")
                 self.read_and_save_tmp_file()
                 self.read_and_save_tmp_file()
-                trans_print("保存数据到临时文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("保存数据到临时文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
 
             now_index = now_index + 1
             now_index = now_index + 1
             # 3
             # 3
             if self.step <= now_index <= self.end:
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
                 begin = datetime.datetime.now()
-                trans_print("开始保存到临时正式文件")
+                info("开始保存到临时正式文件")
                 self.statistics_and_save_tmp_formal_file()
                 self.statistics_and_save_tmp_formal_file()
-                trans_print("保存到临时正式文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("保存到临时正式文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
 
             now_index = now_index + 1
             now_index = now_index + 1
             # 4
             # 4
             if self.step <= now_index <= self.end:
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
                 begin = datetime.datetime.now()
-                trans_print("开始保存归档文件")
+                info("开始保存归档文件")
                 self.archive_file()
                 self.archive_file()
-                trans_print("保存到保存归档文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("保存到保存归档文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
 
             now_index = now_index + 1
             now_index = now_index + 1
             # 5
             # 5
             if self.step <= now_index <= self.end:
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
                 begin = datetime.datetime.now()
-                trans_print("开始保存数据到正式文件")
+                info("开始保存数据到正式文件")
                 self.combine_and_save_formal_file()
                 self.combine_and_save_formal_file()
-                trans_print("保存数据到正式文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("保存数据到正式文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
 
             now_index = now_index + 1
             now_index = now_index + 1
             # 6
             # 6
             if self.step <= now_index <= self.end:
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
                 begin = datetime.datetime.now()
-                trans_print("开始保存到数据库,是否存库:", self.pathsAndTable.save_db)
+                info("开始保存到数据库,是否存库:", self.pathsAndTable.save_db)
                 self.save_to_db()
                 self.save_to_db()
-                trans_print("保存到数据结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("保存到数据结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
 
             self.update_exec_progress()
             self.update_exec_progress()
         except Exception as e:
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             update_trans_status_error(self.id, str(e), self.save_db)
             update_trans_status_error(self.id, str(e), self.save_db)
             raise e
             raise e
         finally:
         finally:
             self.pathsAndTable.delete_tmp_files()
             self.pathsAndTable.delete_tmp_files()
-            trans_print("执行结束,总耗时:", str(datetime.datetime.now() - total_begin))
+            info("执行结束,总耗时:", str(datetime.datetime.now() - total_begin))

+ 3 - 3
etl/common/ClearData.py

@@ -2,7 +2,7 @@ import datetime
 
 
 from etl.common.PathsAndTable import PathsAndTable
 from etl.common.PathsAndTable import PathsAndTable
 from service.trans_conf_service import update_trans_transfer_progress
 from service.trans_conf_service import update_trans_transfer_progress
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info
 
 
 
 
 class ClearData(object):
 class ClearData(object):
@@ -19,8 +19,8 @@ class ClearData(object):
         # self.pathsAndTable.delete_batch_files()
         # self.pathsAndTable.delete_batch_files()
 
 
     def run(self):
     def run(self):
-        trans_print("开始清理数据,临时文件夹:", self.pathsAndTable.get_tmp_path())
+        info("开始清理数据,临时文件夹:", self.pathsAndTable.get_tmp_path())
         begin = datetime.datetime.now()
         begin = datetime.datetime.now()
         self.clean_data()
         self.clean_data()
         update_trans_transfer_progress(self.pathsAndTable.id, 5, self.pathsAndTable.save_db)
         update_trans_transfer_progress(self.pathsAndTable.id, 5, self.pathsAndTable.save_db)
-        trans_print("清理数据结束,耗时:", datetime.datetime.now() - begin)
+        info("清理数据结束,耗时:", datetime.datetime.now() - begin)

+ 117 - 44
etl/common/CombineAndSaveFormalFile.py

@@ -1,61 +1,134 @@
 import multiprocessing
 import multiprocessing
 import os
 import os
+from typing import Dict, List, Tuple, Optional
 
 
 import pandas as pd
 import pandas as pd
 
 
+from conf.constants import DataProcessing, ParallelProcessing
 from etl.common.PathsAndTable import PathsAndTable
 from etl.common.PathsAndTable import PathsAndTable
 from utils.file.trans_methods import read_excel_files, read_file_to_df, copy_to_new
 from utils.file.trans_methods import read_excel_files, read_file_to_df, copy_to_new
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, debug
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 
 
 
 
-class CombineAndSaveFormalFile(object):
+class CombineAndSaveFormalFile:
+    """合并并保存正式文件"""
 
 
-    def __init__(self, pathsAndTable: PathsAndTable):
-        self.pathsAndTable = pathsAndTable
-        self.update_files = multiprocessing.Manager().list()
+    # 常量定义
+    TIME_STAMP_COLUMN = DataProcessing.TIME_STAMP_COLUMN
 
 
-    def combine_and_save(self, file_path, key, exists_file_path):
-        exists_same = False
-        if exists_file_path:
-            exists_same = True
+    def __init__(self, paths_and_table: PathsAndTable):
+        """
+        初始化合并器
+
+        Args:
+            paths_and_table: 路径和表信息对象
+        """
+        self.paths_and_table = paths_and_table
+        self.updated_files = multiprocessing.Manager().list()
+
+    def _merge_dataframes(self, exists_df: pd.DataFrame, now_df: pd.DataFrame) -> pd.DataFrame:
+        """
+        合并两个数据框并去重排序
+
+        Args:
+            exists_df: 已存在的数据框
+            now_df: 当前的数据框
+
+        Returns:
+            合并后的数据框
+        """
+        combined_df = pd.concat([exists_df, now_df])
+        # 去重,保留最新的数据
+        combined_df = combined_df.drop_duplicates(
+            subset=self.TIME_STAMP_COLUMN,
+            keep='last'
+        )
+        # 按时间戳排序
+        return combined_df.sort_values(
+            by=self.TIME_STAMP_COLUMN
+        ).reset_index(drop=True)
+
+    def _save_combined_file(self, file_path: str, key: Tuple[str, str], exists_file_path: Optional[str]) -> None:
+        """
+        保存合并后的文件
+
+        Args:
+            file_path: 新文件路径
+            key: 文件键值 (目录名, 文件名)
+            exists_file_path: 已存在的文件路径,如果为None则表示不存在
+        """
+        has_exists = exists_file_path is not None
+
+        if has_exists:
+            # 合并并保存
             exists_df = read_file_to_df(exists_file_path)
             exists_df = read_file_to_df(exists_file_path)
             now_df = read_file_to_df(file_path)
             now_df = read_file_to_df(file_path)
-            # 合并两个 DataFrame
-            combined_df = pd.concat([exists_df, now_df])
-            # 去重,保留 now_df 的值
-            combined_df = combined_df.drop_duplicates(subset='time_stamp', keep='last')
-            # 按 time_stamp 排序
-            combined_df = combined_df.sort_values(by='time_stamp').reset_index(drop=True)
+            combined_df = self._merge_dataframes(exists_df, now_df)
             combined_df.to_csv(exists_file_path, encoding='utf-8', index=False)
             combined_df.to_csv(exists_file_path, encoding='utf-8', index=False)
-            self.update_files.append(exists_file_path)
+            self.updated_files.append(exists_file_path)
         else:
         else:
-            save_path = str(os.path.join(self.pathsAndTable.get_save_path(), key[0], key[1]))
-            copy_to_new(file_path, save_path)
-            self.update_files.append(save_path)
-        trans_print(f"{key[0]}/{key[1]} {'包含' if exists_same else '不包含'} 相同文件,保存成功")
-
-    def combine_and_save_formal_file(self):
-        exists_files = read_excel_files(self.pathsAndTable.get_save_path())
-        exists_file_maps = dict()
-        for file_path in exists_files:
-            name = (os.path.basename(os.path.dirname(file_path)), os.path.basename(file_path))
-            exists_file_maps[name] = file_path
-
-        new_files = read_excel_files(self.pathsAndTable.get_tmp_formal_path())
-        new_file_maps = dict()
-        for file_path in new_files:
-            name = (os.path.basename(os.path.dirname(file_path)), os.path.basename(file_path))
-            new_file_maps[name] = file_path
-
-        same_keys = list(set(exists_file_maps.keys()).intersection(new_file_maps.keys()))
-        split_count = get_available_cpu_count_with_percent(2 / 3)
-        with multiprocessing.Pool(split_count) as pool:
-            pool.starmap(self.combine_and_save,
-                         [(file_path, key, exists_file_maps[key] if key in same_keys else None) for key, file_path in
-                          new_file_maps.items()])
-
-    def run(self):
+            # 复制新文件
+            save_dir = str(os.path.join(
+                self.paths_and_table.get_save_path(),
+                key[0],
+                key[1]
+            ))
+            copy_to_new(file_path, save_dir)
+            self.updated_files.append(save_dir)
+
+        # 记录日志
+        status = "包含" if has_exists else "不包含"
+        debug(f"{key[0]}/{key[1]} {status} 相同文件,保存成功")
+
+    def _build_file_maps(self, base_path: str) -> Dict[Tuple[str, str], str]:
+        """
+        构建文件映射字典
+
+        Args:
+            base_path: 基础路径
+
+        Returns:
+            文件路径映射字典,键为(目录名, 文件名),值为完整路径
+        """
+        files = read_excel_files(base_path)
+        return {
+            (os.path.basename(os.path.dirname(file_path)), os.path.basename(file_path)): file_path
+            for file_path in files
+        }
+
+    def combine_and_save_formal_file(self) -> None:
+        """合并并保存正式文件的主方法"""
+        # 构建已存在文件和新文件的映射
+        exists_file_maps = self._build_file_maps(self.paths_and_table.get_save_path())
+        new_file_maps = self._build_file_maps(self.paths_and_table.get_tmp_formal_path())
+
+        # 找出相同键的文件
+        same_keys = set(exists_file_maps.keys()) & set(new_file_maps.keys())
+
+        # 准备并行处理参数
+        process_args = [
+            (
+                file_path,
+                key,
+                exists_file_maps.get(key) if key in same_keys else None
+            )
+            for key, file_path in new_file_maps.items()
+        ]
+
+        # 使用并行处理
+        cpu_count = get_available_cpu_count_with_percent(ParallelProcessing.CPU_USAGE_PERCENT)
+        cpu_count = min(cpu_count, ParallelProcessing.MAX_PROCESSES)
+        with multiprocessing.Pool(cpu_count) as pool:
+            pool.starmap(self._save_combined_file, process_args)
+
+    def run(self) -> List[str]:
+        """
+        执行合并操作
+
+        Returns:
+            更新后的文件路径列表
+        """
         self.combine_and_save_formal_file()
         self.combine_and_save_formal_file()
-        print(self.update_files)
-        return list(self.update_files)
+        info(f"共处理了 {len(self.updated_files)} 个文件")
+        return list(self.updated_files)

+ 96 - 20
etl/common/PathsAndTable.py

@@ -1,14 +1,33 @@
 import shutil
 import shutil
 from os import path, sep
 from os import path, sep
 
 
+from conf.constants import Paths
 from service.trans_service import creat_min_sec_table, create_warn_fault_table
 from service.trans_service import creat_min_sec_table, create_warn_fault_table
 from utils.conf.read_conf import *
 from utils.conf.read_conf import *
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info
 
 
 
 
 class PathsAndTable(object):
 class PathsAndTable(object):
-    def __init__(self, id=None, task_name=None, read_dir=None, wind_farm_code=None, wind_farm_name=None,
-                 read_type=None, save_db=True, save_zip=True, yaml_config=None, wind_col_trans=None):
+    """路径和表管理类"""
+
+    def __init__(self, id: int = None, task_name: str = None, read_dir: str = None, wind_farm_code: str = None,
+                 wind_farm_name: str = None, read_type: str = None, save_db: bool = True,
+                 save_zip: bool = True, yaml_config: dict = None, wind_col_trans: dict = None):
+        """
+        初始化路径和表管理类
+        
+        Args:
+            id: 任务ID
+            task_name: 任务名称
+            read_dir: 读取目录
+            wind_farm_code: 风电场编码
+            wind_farm_name: 风电场名称
+            read_type: 读取类型
+            save_db: 是否保存到数据库
+            save_zip: 是否保存为压缩文件
+            yaml_config: YAML配置
+            wind_col_trans: 风机列转换映射
+        """
         self.id = id
         self.id = id
         self.task_name = task_name
         self.task_name = task_name
         self.read_dir = read_dir
         self.read_dir = read_dir
@@ -25,11 +44,11 @@ class PathsAndTable(object):
 
 
         self.use_tidb = read_conf(yaml_config, 'use_tidb', False)
         self.use_tidb = read_conf(yaml_config, 'use_tidb', False)
 
 
-        self.tmp_base_path = read_conf(yaml_config, "tmp_base_path", "/tmp")
+        self.tmp_base_path = read_conf(yaml_config, "tmp_base_path", Paths.DEFAULT_TMP_BASE_PATH)
         if save_path_conf:
         if save_path_conf:
             self.save_path = save_path_conf + sep + self.wind_farm_name
             self.save_path = save_path_conf + sep + self.wind_farm_name
         else:
         else:
-            find_index = read_dir.find(read_conf(yaml_config, 'etl_origin_path_contain', "etl_origin_path_contain"))
+            find_index = read_dir.find(read_conf(yaml_config, 'etl_origin_path_contain', "收资数据"))
             if find_index == -1:
             if find_index == -1:
                 raise Exception("路径未包含原始数据特定字符:" + read_dir)
                 raise Exception("路径未包含原始数据特定字符:" + read_dir)
             self.save_path = read_dir[0:find_index] + sep + "清理数据"
             self.save_path = read_dir[0:find_index] + sep + "清理数据"
@@ -37,48 +56,105 @@ class PathsAndTable(object):
         if self.save_path is None:
         if self.save_path is None:
             raise Exception("未配置保存路径:" + read_dir)
             raise Exception("未配置保存路径:" + read_dir)
 
 
-        self.archive_path = read_conf(yaml_config, "archive_path", "/tmp/archive")
+        self.archive_path = read_conf(yaml_config, "archive_path", Paths.DEFAULT_ARCHIVE_PATH)
 
 
-    def get_save_path(self):
+    def get_save_path(self) -> str:
+        """
+        获取保存路径
+        
+        Returns:
+            保存路径
+        """
         return path.join(self.save_path, self.read_type)
         return path.join(self.save_path, self.read_type)
 
 
-    def get_tmp_path(self):
+    def get_tmp_path(self) -> str:
+        """
+        获取临时路径
+        
+        Returns:
+            临时路径
+        """
         return str(path.join(self.tmp_base_path, str(self.id) + "_" + self.task_name + "_" + self.read_type))
         return str(path.join(self.tmp_base_path, str(self.id) + "_" + self.task_name + "_" + self.read_type))
 
 
-    def get_excel_tmp_path(self):
+    def get_excel_tmp_path(self) -> str:
+        """
+        获取Excel临时路径
+        
+        Returns:
+            Excel临时路径
+        """
         return path.join(self.get_tmp_path(), 'excel_tmp' + sep)
         return path.join(self.get_tmp_path(), 'excel_tmp' + sep)
 
 
-    def get_read_tmp_path(self):
+    def get_read_tmp_path(self) -> str:
+        """
+        获取读取临时路径
+        
+        Returns:
+            读取临时路径
+        """
         return path.join(self.get_tmp_path(), 'read_tmp')
         return path.join(self.get_tmp_path(), 'read_tmp')
 
 
-    def get_merge_tmp_path(self, wind_turbine_number=None):
+    def get_merge_tmp_path(self, wind_turbine_number=None) -> str:
+        """
+        获取合并临时路径
+        
+        Args:
+            wind_turbine_number: 风机编号
+            
+        Returns:
+            合并临时路径
+        """
         if wind_turbine_number is None:
         if wind_turbine_number is None:
             return path.join(self.get_tmp_path(), 'merge_tmp')
             return path.join(self.get_tmp_path(), 'merge_tmp')
         else:
         else:
             return path.join(self.get_tmp_path(), 'merge_tmp', str(wind_turbine_number))
             return path.join(self.get_tmp_path(), 'merge_tmp', str(wind_turbine_number))
 
 
-    def get_tmp_formal_path(self):
+    def get_tmp_formal_path(self) -> str:
+        """
+        获取正式临时路径
+        
+        Returns:
+            正式临时路径
+        """
         return path.join(self.get_tmp_path(), 'formal_tmp')
         return path.join(self.get_tmp_path(), 'formal_tmp')
 
 
-    def get_archive_path(self):
+    def get_archive_path(self) -> str:
+        """
+        获取归档路径
+        
+        Returns:
+            归档路径
+        """
         return path.join(self.archive_path, self.wind_farm_name, self.read_type, f'{self.id}_{self.task_name}')
         return path.join(self.archive_path, self.wind_farm_name, self.read_type, f'{self.id}_{self.task_name}')
 
 
-    def get_table_name(self):
+    def get_table_name(self) -> str:
+        """
+        获取表名
+        
+        Returns:
+            表名
+        """
         return "_".join([self.wind_farm_code, self.read_type])
         return "_".join([self.wind_farm_code, self.read_type])
 
 
-    def delete_tmp_files(self):
-        trans_print("开始删除临时文件夹")
+    def delete_tmp_files(self) -> None:
+        """
+        删除临时文件
+        """
+        info("开始删除临时文件夹")
         if path.exists(self.get_tmp_path()):
         if path.exists(self.get_tmp_path()):
             shutil.rmtree(self.get_tmp_path())
             shutil.rmtree(self.get_tmp_path())
-        trans_print("删除临时文件夹删除成功")
+        info("删除临时文件夹删除成功")
 
 
-    def create_wind_farm_db(self):
+    def create_wind_farm_db(self) -> None:
+        """
+        创建风电场数据库表
+        """
         if self.save_db:
         if self.save_db:
-            trans_print("开始创建表")
+            info("开始创建表")
             if self.read_type in ['second', 'minute']:
             if self.read_type in ['second', 'minute']:
                 creat_min_sec_table(self.get_table_name(), self.read_type, self.wind_farm_name, self.use_tidb)
                 creat_min_sec_table(self.get_table_name(), self.read_type, self.wind_farm_name, self.use_tidb)
             elif self.read_type in ['fault', 'warn']:
             elif self.read_type in ['fault', 'warn']:
                 create_warn_fault_table(self.get_table_name(), self.wind_farm_name, )
                 create_warn_fault_table(self.get_table_name(), self.wind_farm_name, )
             else:
             else:
                 raise Exception("不支持的读取类型:" + self.read_type)
                 raise Exception("不支持的读取类型:" + self.read_type)
-            trans_print("建表结束")
+            info("建表结束")

+ 47 - 22
etl/common/SaveToDb.py

@@ -5,8 +5,7 @@ import traceback
 from etl.common.PathsAndTable import PathsAndTable
 from etl.common.PathsAndTable import PathsAndTable
 from service.trans_conf_service import update_trans_transfer_progress
 from service.trans_conf_service import update_trans_transfer_progress
 from service.trans_service import save_scada_file_to_db, save_file_to_db
 from service.trans_service import save_scada_file_to_db, save_file_to_db
-from utils.file.trans_methods import split_array
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, error
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 
 
 
 
@@ -25,34 +24,60 @@ class SaveToDb(object):
         all_saved_files = [i for i in all_saved_files if
         all_saved_files = [i for i in all_saved_files if
                            os.path.basename(i).split(".")[0] in self.pathsAndTable.wind_col_trans.keys()]
                            os.path.basename(i).split(".")[0] in self.pathsAndTable.wind_col_trans.keys()]
 
 
+        if not all_saved_files:
+            info("没有文件需要保存到数据库")
+            return
+
         self.pathsAndTable.create_wind_farm_db()
         self.pathsAndTable.create_wind_farm_db()
 
 
-        split_count = get_available_cpu_count_with_percent(percent=2 / 3)
-        split_count = split_count if split_count <= len(all_saved_files) else len(all_saved_files)
-        all_arrays = split_array(all_saved_files, split_count)
+        # 计算最佳进程数
+        max_processes = get_available_cpu_count_with_percent(percent=2 / 3)
+        max_processes = min(max_processes, len(all_saved_files), 10)  # 限制最大进程数为10
+
         try:
         try:
-            for index, arr in enumerate(all_arrays):
-                with multiprocessing.Pool(10) as pool:
-                    if self.pathsAndTable.read_type in ['minute', 'second']:
-                        pool.starmap(save_scada_file_to_db,
-                                     [(self.pathsAndTable.get_table_name(), file,
-                                       self.pathsAndTable.wind_col_trans[os.path.basename(file).split(".")[0]],
-                                       os.path.basename(os.path.dirname(file)),
-                                       self.batch_count,self.pathsAndTable.use_tidb) for file in arr])
-                    else:
-                        pool.starmap(save_file_to_db,
-                                     [(self.pathsAndTable.get_table_name(), file, self.batch_count) for file in arr])
-
-                update_trans_transfer_progress(self.pathsAndTable.id,
-                                               round(70 + 29 * (index + 1) / len(all_arrays), 2),
-                                               self.pathsAndTable.save_db)
+            # 创建一个进程池处理所有文件
+            with multiprocessing.Pool(max_processes) as pool:
+                if self.pathsAndTable.read_type in ['minute', 'second']:
+                    # 准备参数
+                    params = [(self.pathsAndTable.get_table_name(), file,
+                               self.pathsAndTable.wind_col_trans[os.path.basename(file).split(".")[0]],
+                               os.path.basename(os.path.dirname(file)),
+                               self.batch_count, self.pathsAndTable.use_tidb) for file in all_saved_files]
+
+                    # 分批次处理并更新进度
+                    batch_size = max(1, len(params) // 10)  # 最多10个批次
+                    for i in range(0, len(params), batch_size):
+                        batch_params = params[i:i + batch_size]
+                        pool.starmap(save_scada_file_to_db, batch_params)
+
+                        # 更新进度
+                        progress = 70 + 29 * (i + len(batch_params)) / len(params)
+                        update_trans_transfer_progress(self.pathsAndTable.id,
+                                                       round(progress, 2),
+                                                       self.pathsAndTable.save_db)
+
+                else:
+                    # 准备参数
+                    params = [(self.pathsAndTable.get_table_name(), file, self.batch_count) for file in all_saved_files]
+
+                    # 分批次处理并更新进度
+                    batch_size = max(1, len(params) // 10)  # 最多10个批次
+                    for i in range(0, len(params), batch_size):
+                        batch_params = params[i:i + batch_size]
+                        pool.starmap(save_file_to_db, batch_params)
+
+                        # 更新进度
+                        progress = 70 + 29 * (i + len(batch_params)) / len(params)
+                        update_trans_transfer_progress(self.pathsAndTable.id,
+                                                       round(progress, 2),
+                                                       self.pathsAndTable.save_db)
         except Exception as e:
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             message = "保存到数据库错误,系统返回错误:" + str(e)
             message = "保存到数据库错误,系统返回错误:" + str(e)
             raise ValueError(message)
             raise ValueError(message)
 
 
     def run(self):
     def run(self):
         if self.pathsAndTable.save_db:
         if self.pathsAndTable.save_db:
             self.mutiprocessing_to_save_db()
             self.mutiprocessing_to_save_db()
-            update_trans_transfer_progress(self.pathsAndTable.id,  99,
+            update_trans_transfer_progress(self.pathsAndTable.id, 99,
                                            self.pathsAndTable.save_db)
                                            self.pathsAndTable.save_db)

+ 43 - 18
etl/common/UnzipAndRemove.py

@@ -1,54 +1,76 @@
 import multiprocessing
 import multiprocessing
+import os
 import traceback
 import traceback
-from os import *
+from typing import List, Optional
 
 
+from conf.constants import ParallelProcessing
 from etl.common.PathsAndTable import PathsAndTable
 from etl.common.PathsAndTable import PathsAndTable
 from service.trans_conf_service import update_trans_transfer_progress
 from service.trans_conf_service import update_trans_transfer_progress
 from utils.file.trans_methods import read_files, read_excel_files, copy_to_new, split_array
 from utils.file.trans_methods import read_files, read_excel_files, copy_to_new, split_array
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, error
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 from utils.zip.unzip import unzip, unrar, get_desc_path
 from utils.zip.unzip import unzip, unrar, get_desc_path
 
 
 
 
 class UnzipAndRemove(object):
 class UnzipAndRemove(object):
-    def __init__(self, pathsAndTable: PathsAndTable, filter_types=None):
+    """解压缩并移动文件类"""
+
+    def __init__(self, pathsAndTable: PathsAndTable, filter_types: Optional[List[str]] = None):
+        """
+        初始化解压缩并移动文件类
+        
+        Args:
+            pathsAndTable: 路径和表对象
+            filter_types: 文件类型过滤器
+        """
         self.pathsAndTable = pathsAndTable
         self.pathsAndTable = pathsAndTable
         self.filter_types = filter_types
         self.filter_types = filter_types
 
 
-    def get_and_remove(self, file):
-
+    def get_and_remove(self, file: str) -> None:
+        """
+        解压缩或移动文件到临时路径
+        
+        Args:
+            file: 文件路径
+        """
         to_path = self.pathsAndTable.get_excel_tmp_path()
         to_path = self.pathsAndTable.get_excel_tmp_path()
-        if str(file).endswith("zip"):
-            if str(file).endswith("csv.zip"):
+        file_lower = str(file).lower()
+        if file_lower.endswith("zip"):
+            if file_lower.endswith("csv.zip"):
                 copy_to_new(file, file.replace(self.pathsAndTable.read_dir, to_path).replace("csv.zip", 'csv.gz'))
                 copy_to_new(file, file.replace(self.pathsAndTable.read_dir, to_path).replace("csv.zip", 'csv.gz'))
             else:
             else:
                 desc_path = file.replace(self.pathsAndTable.read_dir, to_path)
                 desc_path = file.replace(self.pathsAndTable.read_dir, to_path)
                 unzip(file, get_desc_path(desc_path))
                 unzip(file, get_desc_path(desc_path))
                 self.pathsAndTable.has_zip = True
                 self.pathsAndTable.has_zip = True
-        elif str(file).endswith("rar"):
+        elif file_lower.endswith("rar"):
             desc_path = file.replace(self.pathsAndTable.read_dir, to_path)
             desc_path = file.replace(self.pathsAndTable.read_dir, to_path)
             is_success, e = unrar(file, get_desc_path(desc_path))
             is_success, e = unrar(file, get_desc_path(desc_path))
             self.pathsAndTable.has_zip = True
             self.pathsAndTable.has_zip = True
-            if not is_success:
-                trans_print(traceback.format_exc())
-                pass
         else:
         else:
             copy_to_new(file, file.replace(self.pathsAndTable.read_dir, to_path))
             copy_to_new(file, file.replace(self.pathsAndTable.read_dir, to_path))
 
 
-    def remove_file_to_tmp_path(self):
+    def remove_file_to_tmp_path(self) -> List[str]:
+        """
+        将文件移动到临时路径
+        
+        Returns:
+            处理后的文件列表
+        """
         # 读取文件
         # 读取文件
         try:
         try:
-            if path.isfile(self.pathsAndTable.read_dir):
+            if os.path.isfile(self.pathsAndTable.read_dir):
                 all_files = [self.pathsAndTable.read_dir]
                 all_files = [self.pathsAndTable.read_dir]
             else:
             else:
                 all_files = read_files(self.pathsAndTable.read_dir)
                 all_files = read_files(self.pathsAndTable.read_dir)
 
 
             # 最大取系统cpu的 三分之二
             # 最大取系统cpu的 三分之二
             split_count = get_available_cpu_count_with_percent(2 / 3)
             split_count = get_available_cpu_count_with_percent(2 / 3)
+            # 限制最大进程数
+            split_count = min(split_count, ParallelProcessing.MAX_PROCESSES)
             all_arrays = split_array(all_files, split_count)
             all_arrays = split_array(all_files, split_count)
 
 
             for index, arr in enumerate(all_arrays):
             for index, arr in enumerate(all_arrays):
-                pool_count = split_count if split_count < len(arr) else len(arr)
+                pool_count = min(split_count, len(arr))
                 with multiprocessing.Pool(pool_count) as pool:
                 with multiprocessing.Pool(pool_count) as pool:
                     pool.starmap(self.get_and_remove, [(i,) for i in arr])
                     pool.starmap(self.get_and_remove, [(i,) for i in arr])
                 update_trans_transfer_progress(self.pathsAndTable.id,
                 update_trans_transfer_progress(self.pathsAndTable.id,
@@ -57,14 +79,17 @@ class UnzipAndRemove(object):
 
 
             all_files = read_excel_files(self.pathsAndTable.get_excel_tmp_path())
             all_files = read_excel_files(self.pathsAndTable.get_excel_tmp_path())
 
 
-            trans_print('读取文件数量:', len(all_files))
+            info('读取文件数量:', len(all_files))
         except Exception as e:
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             message = "读取文件列表错误:" + self.pathsAndTable.read_dir + ",系统返回错误:" + str(e)
             message = "读取文件列表错误:" + self.pathsAndTable.read_dir + ",系统返回错误:" + str(e)
             raise ValueError(message)
             raise ValueError(message)
         return all_files
         return all_files
 
 
-    def run(self):
+    def run(self) -> None:
+        """
+        运行解压缩和移动文件流程
+        """
         self.remove_file_to_tmp_path()
         self.remove_file_to_tmp_path()
-        update_trans_transfer_progress(self.pathsAndTable.id,  20,
+        update_trans_transfer_progress(self.pathsAndTable.id, 20,
                                        self.pathsAndTable.save_db)
                                        self.pathsAndTable.save_db)

+ 3 - 3
etl/wind_power/fault_warn/FaultWarnTrans.py

@@ -10,7 +10,7 @@ from service.trans_service import get_fault_warn_conf, drop_table, create_warn_f
     save_file_to_db
     save_file_to_db
 from utils.conf.read_conf import read_conf
 from utils.conf.read_conf import read_conf
 from utils.file.trans_methods import read_excel_files, read_file_to_df, create_file_path, valid_eval
 from utils.file.trans_methods import read_excel_files, read_file_to_df, create_file_path, valid_eval
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, error
 
 
 
 
 class FaultWarnTrans(BaseDataTrans):
 class FaultWarnTrans(BaseDataTrans):
@@ -27,14 +27,14 @@ class FaultWarnTrans(BaseDataTrans):
 
 
     # 第三步 读取 并 保存到临时文件
     # 第三步 读取 并 保存到临时文件
     def read_and_save_tmp_file(self):
     def read_and_save_tmp_file(self):
-        trans_print("无需保存临时文件")
+        info("无需保存临时文件")
 
 
     # 读取并保存到临时正式文件
     # 读取并保存到临时正式文件
     def statistics_and_save_tmp_formal_file(self):
     def statistics_and_save_tmp_formal_file(self):
         conf_map = self.get_filed_conf()
         conf_map = self.get_filed_conf()
         if conf_map is None or type(conf_map) == tuple or len(conf_map.keys()) == 0:
         if conf_map is None or type(conf_map) == tuple or len(conf_map.keys()) == 0:
             message = f"未找到{self.id}的{self.transfer_type}配置"
             message = f"未找到{self.id}的{self.transfer_type}配置"
-            trans_print(message)
+            error(message)
             update_trans_status_error(self.id, message, self.save_db)
             update_trans_status_error(self.id, message, self.save_db)
         else:
         else:
 
 

+ 4 - 4
etl/wind_power/laser/LaserTrans.py

@@ -7,11 +7,11 @@ import numpy as np
 import pandas as pd
 import pandas as pd
 
 
 from service.plt_service import get_all_wind
 from service.plt_service import get_all_wind
-from service.trans_service import save_df_to_db
 from service.trans_conf_service import update_trans_status_running, update_trans_transfer_progress, \
 from service.trans_conf_service import update_trans_status_running, update_trans_transfer_progress, \
     update_trans_status_success
     update_trans_status_success
+from service.trans_service import save_df_to_db
 from utils.file.trans_methods import read_files, read_file_to_df
 from utils.file.trans_methods import read_files, read_file_to_df
-from utils.log.trans_log import set_trance_id, trans_print
+from utils.log.trans_log import set_trance_id, info
 
 
 
 
 class LaserTrans():
 class LaserTrans():
@@ -56,7 +56,7 @@ class LaserTrans():
         trance_id = '-'.join([self.wind_farm_code, 'laser'])
         trance_id = '-'.join([self.wind_farm_code, 'laser'])
         set_trance_id(trance_id)
         set_trance_id(trance_id)
         all_files = read_files(self.read_path, ['csv'])
         all_files = read_files(self.read_path, ['csv'])
-        trans_print(self.wind_farm_code, '获取文件总数为:', len(all_files))
+        info(self.wind_farm_code, '获取文件总数为:', len(all_files))
         pool_count = 8 if len(all_files) > 8 else len(all_files)
         pool_count = 8 if len(all_files) > 8 else len(all_files)
 
 
         with multiprocessing.Pool(pool_count) as pool:
         with multiprocessing.Pool(pool_count) as pool:
@@ -70,7 +70,7 @@ class LaserTrans():
         update_trans_status_success(self.id, len(df['wind_turbine_number'].unique()), None,
         update_trans_status_success(self.id, len(df['wind_turbine_number'].unique()), None,
                                     df['acquisition_time'].min(), df['acquisition_time'].max(), df.shape[0])
                                     df['acquisition_time'].min(), df['acquisition_time'].max(), df.shape[0])
         # update_trans_status_success(self.id)
         # update_trans_status_success(self.id)
-        trans_print(self.wind_farm_code, '执行结束,总耗时:', (datetime.datetime.now() - self.begin))
+        info(self.wind_farm_code, '执行结束,总耗时:', (datetime.datetime.now() - self.begin))
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':

+ 6 - 6
etl/wind_power/min_sec/ClassIdentifier.py

@@ -5,7 +5,7 @@ import numpy as np
 from pandas import DataFrame
 from pandas import DataFrame
 
 
 from utils.file.trans_methods import read_file_to_df
 from utils.file.trans_methods import read_file_to_df
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import error, warning, debug
 
 
 
 
 class ClassIdentifier(object):
 class ClassIdentifier(object):
@@ -35,11 +35,11 @@ class ClassIdentifier(object):
         self.cut_out_speed = cut_out_speed
         self.cut_out_speed = cut_out_speed
 
 
         if self.rated_power is None:
         if self.rated_power is None:
-            trans_print(wind_turbine_number, "WARNING:rated_power配置为空的")
+            warning(wind_turbine_number, "WARNING:rated_power配置为空的")
             self.rated_power = 1500
             self.rated_power = 1500
 
 
         if self.cut_out_speed is None:
         if self.cut_out_speed is None:
-            trans_print(cut_out_speed, "WARNING:cut_out_speed配置为空的")
+            warning(cut_out_speed, "WARNING:cut_out_speed配置为空的")
             self.cut_out_speed = 20
             self.cut_out_speed = 20
 
 
         if file_path is None and origin_df is None:
         if file_path is None and origin_df is None:
@@ -350,12 +350,12 @@ class ClassIdentifier(object):
     def run(self):
     def run(self):
         # Implement your class identification logic here
         # Implement your class identification logic here
         begin = datetime.datetime.now()
         begin = datetime.datetime.now()
-        trans_print("打标签开始,风机号:", self.wind_turbine_number, self.df.shape)
+        debug("打标签开始,风机号:", self.wind_turbine_number, self.df.shape)
         try:
         try:
             df = self.identifier()
             df = self.identifier()
         except Exception as e:
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             message = str(e) + ',风机编号:' + self.wind_turbine_number
             message = str(e) + ',风机编号:' + self.wind_turbine_number
             raise Exception('打标签失败:' + message)
             raise Exception('打标签失败:' + message)
-        trans_print("打标签结束,", df.shape, ",耗时:", datetime.datetime.now() - begin)
+        debug("打标签结束,", df.shape, ",耗时:", datetime.datetime.now() - begin)
         return df
         return df

+ 51 - 26
etl/wind_power/min_sec/MinSecTrans.py

@@ -3,6 +3,7 @@
 # @Author  : 魏志亮
 # @Author  : 魏志亮
 import multiprocessing
 import multiprocessing
 import os.path
 import os.path
+from typing import Optional
 
 
 from etl.common.BaseDataTrans import BaseDataTrans
 from etl.common.BaseDataTrans import BaseDataTrans
 from etl.common.CombineAndSaveFormalFile import CombineAndSaveFormalFile
 from etl.common.CombineAndSaveFormalFile import CombineAndSaveFormalFile
@@ -12,26 +13,67 @@ from etl.wind_power.min_sec.TransParam import TransParam
 from service.trans_conf_service import update_trans_status_success, update_trans_status_error
 from service.trans_conf_service import update_trans_status_success, update_trans_status_error
 from service.trans_service import get_min_sec_conf
 from service.trans_service import get_min_sec_conf
 from utils.conf.read_conf import read_conf
 from utils.conf.read_conf import read_conf
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import error
 
 
 
 
 class MinSecTrans(BaseDataTrans):
 class MinSecTrans(BaseDataTrans):
+    """分钟/秒级数据转换类"""
 
 
-    def __init__(self, data: dict = None, save_db=True, yaml_config=None, step=0, end=999):
+    # 转换列名列表
+    TRANS_COLS = [
+        'wind_turbine_number', 'time_stamp', 'active_power', 'rotor_speed', 'generator_speed',
+        'wind_velocity', 'pitch_angle_blade_1', 'pitch_angle_blade_2', 'pitch_angle_blade_3',
+        'cabin_position', 'true_wind_direction', 'yaw_error1', 'set_value_of_active_power',
+        'gearbox_oil_temperature', 'generatordrive_end_bearing_temperature',
+        'generatornon_drive_end_bearing_temperature', 'wind_turbine_status', 'wind_turbine_status2',
+        'cabin_temperature', 'twisted_cable_angle', 'front_back_vibration_of_the_cabin',
+        'side_to_side_vibration_of_the_cabin', 'actual_torque', 'given_torque',
+        'clockwise_yaw_count', 'counterclockwise_yaw_count', 'unusable', 'power_curve_available',
+        'required_gearbox_speed',
+        'inverter_speed_master_control', 'outside_cabin_temperature', 'main_bearing_temperature',
+        'main_bearing_temperature_2', 'gearbox_high_speed_shaft_bearing_temperature',
+        'gearboxmedium_speed_shaftbearing_temperature',
+        'gearbox_low_speed_shaft_bearing_temperature', 'generator_winding1_temperature',
+        'generator_winding2_temperature', 'generator_winding3_temperature',
+        'turbulence_intensity', 'grid_a_phase_current', 'grid_b_phase_current',
+        'grid_c_phase_current', 'reactive_power', 'param1', 'param2', 'param3', 'param4', 'param5',
+        'param6', 'param7', 'param8', 'param9', 'param10'
+    ]
+
+    def __init__(self, data: dict = None, save_db: bool = True, yaml_config: dict = None, step: int = 0,
+                 end: int = 999):
+        """
+        初始化分钟/秒级数据转换类
+        
+        Args:
+            data: 任务数据字典
+            save_db: 是否保存到数据库
+            yaml_config: YAML配置
+            step: 开始步骤
+            end: 结束步骤
+        """
         super(MinSecTrans, self).__init__(data, save_db, yaml_config, step, end)
         super(MinSecTrans, self).__init__(data, save_db, yaml_config, step, end)
         self.statistics_map = multiprocessing.Manager().dict()
         self.statistics_map = multiprocessing.Manager().dict()
         self.trans_param = self.get_trans_param()
         self.trans_param = self.get_trans_param()
         self.trans_param.wind_col_trans = self.wind_col_trans
         self.trans_param.wind_col_trans = self.wind_col_trans
 
 
     def get_filed_conf(self):
     def get_filed_conf(self):
+        """获取配置"""
         return get_min_sec_conf(self.wind_farm_code, self.transfer_type)
         return get_min_sec_conf(self.wind_farm_code, self.transfer_type)
 
 
-    def get_trans_param(self):
+    def get_trans_param(self) -> Optional[TransParam]:
+        """
+        获取转换参数
+        
+        Returns:
+            TransParam对象
+        """
         conf_map = self.get_filed_conf()
         conf_map = self.get_filed_conf()
         if conf_map is None or type(conf_map) == tuple or len(conf_map.keys()) == 0:
         if conf_map is None or type(conf_map) == tuple or len(conf_map.keys()) == 0:
             message = f"未找到{self.id}的{self.transfer_type}配置"
             message = f"未找到{self.id}的{self.transfer_type}配置"
-            trans_print(message)
+            error(message)
             update_trans_status_error(self.id, message, self.save_db)
             update_trans_status_error(self.id, message, self.save_db)
+            return None
         else:
         else:
             resolve_col_prefix = read_conf(conf_map, 'resolve_col_prefix')
             resolve_col_prefix = read_conf(conf_map, 'resolve_col_prefix')
             wind_name_exec = read_conf(conf_map, 'wind_name_exec', None)
             wind_name_exec = read_conf(conf_map, 'wind_name_exec', None)
@@ -48,25 +90,7 @@ class MinSecTrans(BaseDataTrans):
             boolean_sec_to_min = int(boolean_sec_to_min) == 1
             boolean_sec_to_min = int(boolean_sec_to_min) == 1
 
 
             cols_trans_all = dict()
             cols_trans_all = dict()
-            trans_cols = ['wind_turbine_number', 'time_stamp', 'active_power', 'rotor_speed', 'generator_speed',
-                          'wind_velocity', 'pitch_angle_blade_1', 'pitch_angle_blade_2', 'pitch_angle_blade_3',
-                          'cabin_position', 'true_wind_direction', 'yaw_error1', 'set_value_of_active_power',
-                          'gearbox_oil_temperature', 'generatordrive_end_bearing_temperature',
-                          'generatornon_drive_end_bearing_temperature', 'wind_turbine_status', 'wind_turbine_status2',
-                          'cabin_temperature', 'twisted_cable_angle', 'front_back_vibration_of_the_cabin',
-                          'side_to_side_vibration_of_the_cabin', 'actual_torque', 'given_torque',
-                          'clockwise_yaw_count', 'counterclockwise_yaw_count', 'unusable', 'power_curve_available',
-                          'required_gearbox_speed',
-                          'inverter_speed_master_control', 'outside_cabin_temperature', 'main_bearing_temperature',
-                          'main_bearing_temperature_2', 'gearbox_high_speed_shaft_bearing_temperature',
-                          'gearboxmedium_speed_shaftbearing_temperature',
-                          'gearbox_low_speed_shaft_bearing_temperature', 'generator_winding1_temperature',
-                          'generator_winding2_temperature', 'generator_winding3_temperature',
-                          'turbulence_intensity', 'grid_a_phase_current', 'grid_b_phase_current',
-                          'grid_c_phase_current', 'reactive_power', 'param1', 'param2', 'param3', 'param4', 'param5',
-                          'param6', 'param7', 'param8', 'param9', 'param10']
-
-            for col in trans_cols:
+            for col in self.TRANS_COLS:
                 cols_trans_all[col] = read_conf(conf_map, col, '')
                 cols_trans_all[col] = read_conf(conf_map, col, '')
 
 
             return TransParam(read_type=self.transfer_type, read_path=self.read_dir,
             return TransParam(read_type=self.transfer_type, read_path=self.read_dir,
@@ -77,13 +101,13 @@ class MinSecTrans(BaseDataTrans):
                               resolve_col_prefix=resolve_col_prefix, need_valid_cols=need_valid_cols,
                               resolve_col_prefix=resolve_col_prefix, need_valid_cols=need_valid_cols,
                               boolean_sec_to_min=boolean_sec_to_min)
                               boolean_sec_to_min=boolean_sec_to_min)
 
 
-    # 第三步 读取 并 保存到临时文件
     def read_and_save_tmp_file(self):
     def read_and_save_tmp_file(self):
+        """第三步:读取并保存到临时文件"""
         read_and_save_tmp = ReadAndSaveTmp(self.pathsAndTable, self.trans_param)
         read_and_save_tmp = ReadAndSaveTmp(self.pathsAndTable, self.trans_param)
         read_and_save_tmp.run()
         read_and_save_tmp.run()
 
 
-    # 第四步 统计 并 保存到正式文件
     def statistics_and_save_tmp_formal_file(self):
     def statistics_and_save_tmp_formal_file(self):
+        """第四步:统计并保存到正式文件"""
         # 保存到正式文件
         # 保存到正式文件
         statistics_and_save_tmp_formal_file = StatisticsAndSaveTmpFormalFile(self.pathsAndTable, self.trans_param,
         statistics_and_save_tmp_formal_file = StatisticsAndSaveTmpFormalFile(self.pathsAndTable, self.trans_param,
                                                                              self.statistics_map,
                                                                              self.statistics_map,
@@ -91,11 +115,12 @@ class MinSecTrans(BaseDataTrans):
         statistics_and_save_tmp_formal_file.run()
         statistics_and_save_tmp_formal_file.run()
 
 
     def combine_and_save_formal_file(self):
     def combine_and_save_formal_file(self):
+        """合并并保存正式文件"""
         combine_and_save_formal_file = CombineAndSaveFormalFile(self.pathsAndTable)
         combine_and_save_formal_file = CombineAndSaveFormalFile(self.pathsAndTable)
         self.update_files = combine_and_save_formal_file.run()
         self.update_files = combine_and_save_formal_file.run()
 
 
-    # 最后更新执行程度
     def update_exec_progress(self):
     def update_exec_progress(self):
+        """最后更新执行进度"""
         all_files = set([os.path.basename(i) for i in self.update_files])
         all_files = set([os.path.basename(i) for i in self.update_files])
         update_trans_status_success(self.id, len(all_files),
         update_trans_status_success(self.id, len(all_files),
                                     self.statistics_map['time_granularity'],
                                     self.statistics_map['time_granularity'],

+ 111 - 45
etl/wind_power/min_sec/ReadAndSaveTmp.py

@@ -1,31 +1,47 @@
 import datetime
 import datetime
 import multiprocessing
 import multiprocessing
+import os
 import traceback
 import traceback
-from os import *
 
 
 import pandas as pd
 import pandas as pd
 
 
+from conf.constants import ParallelProcessing
 from etl.common.PathsAndTable import PathsAndTable
 from etl.common.PathsAndTable import PathsAndTable
 from etl.wind_power.min_sec import TransParam
 from etl.wind_power.min_sec import TransParam
 from service.trans_conf_service import update_trans_transfer_progress
 from service.trans_conf_service import update_trans_transfer_progress
 from utils.file.trans_methods import read_excel_files, split_array, del_blank, \
 from utils.file.trans_methods import read_excel_files, split_array, del_blank, \
     create_file_path, read_file_to_df, valid_eval
     create_file_path, read_file_to_df, valid_eval
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, debug, error
 from utils.systeminfo.sysinfo import use_files_get_max_cpu_count, get_dir_size
 from utils.systeminfo.sysinfo import use_files_get_max_cpu_count, get_dir_size
 
 
 
 
 class ReadAndSaveTmp(object):
 class ReadAndSaveTmp(object):
+    """读取并保存临时文件类"""
 
 
     def __init__(self, pathsAndTable: PathsAndTable, trans_param: TransParam):
     def __init__(self, pathsAndTable: PathsAndTable, trans_param: TransParam):
+        """
+        初始化读取并保存临时文件类
+        
+        Args:
+            pathsAndTable: 路径和表对象
+            trans_param: 转换参数对象
+        """
         self.pathsAndTable = pathsAndTable
         self.pathsAndTable = pathsAndTable
         self.trans_param = trans_param
         self.trans_param = trans_param
         self.exist_wind_names = multiprocessing.Manager().list()
         self.exist_wind_names = multiprocessing.Manager().list()
         self.lock = multiprocessing.Manager().Lock()
         self.lock = multiprocessing.Manager().Lock()
         self.file_lock = multiprocessing.Manager().dict()
         self.file_lock = multiprocessing.Manager().dict()
 
 
-    def _save_to_tmp_csv_by_name(self, df, name):
+    def _save_to_tmp_csv_by_name(self, df: pd.DataFrame, name: str):
+        """
+        根据风机名称保存到临时CSV文件
+        
+        Args:
+            df: 数据帧
+            name: 风机名称
+        """
         save_name = str(name) + '.csv'
         save_name = str(name) + '.csv'
-        save_path = path.join(self.pathsAndTable.get_read_tmp_path(), save_name)
+        save_path = os.path.join(self.pathsAndTable.get_read_tmp_path(), save_name)
         create_file_path(save_path, is_file_path=True)
         create_file_path(save_path, is_file_path=True)
 
 
         with self.lock:
         with self.lock:
@@ -41,7 +57,13 @@ class ReadAndSaveTmp(object):
             else:
             else:
                 df.to_csv(save_path, index=False, encoding='utf8')
                 df.to_csv(save_path, index=False, encoding='utf8')
 
 
-    def save_merge_data(self, file_path):
+    def save_merge_data(self, file_path: str):
+        """
+        保存合并数据
+        
+        Args:
+            file_path: 文件路径
+        """
         df = self.read_excel_to_df(file_path)
         df = self.read_excel_to_df(file_path)
         if self.trans_param.wind_name_exec:
         if self.trans_param.wind_name_exec:
             if valid_eval(self.trans_param.wind_name_exec):
             if valid_eval(self.trans_param.wind_name_exec):
@@ -67,7 +89,7 @@ class ReadAndSaveTmp(object):
                         else:
                         else:
                             contains_name = False
                             contains_name = False
                             self.exist_wind_names.append(exist_name)
                             self.exist_wind_names.append(exist_name)
-                        save_path = path.join(merge_path, csv_name)
+                        save_path = os.path.join(merge_path, csv_name)
                         now_df = df[df['wind_turbine_number'] == wind_name][['time_stamp', col]]
                         now_df = df[df['wind_turbine_number'] == wind_name][['time_stamp', col]]
                         if contains_name:
                         if contains_name:
                             now_df.to_csv(save_path, index=False, encoding='utf-8', mode='a',
                             now_df.to_csv(save_path, index=False, encoding='utf-8', mode='a',
@@ -75,7 +97,16 @@ class ReadAndSaveTmp(object):
                         else:
                         else:
                             now_df.to_csv(save_path, index=False, encoding='utf-8')
                             now_df.to_csv(save_path, index=False, encoding='utf-8')
 
 
-    def trans_df_cols(self, df):
+    def trans_df_cols(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        转换数据帧列名
+        
+        Args:
+            df: 数据帧
+        
+        Returns:
+            转换后的数据帧
+        """
         if self.trans_param.is_vertical_table:
         if self.trans_param.is_vertical_table:
             pass
             pass
         else:
         else:
@@ -120,8 +151,13 @@ class ReadAndSaveTmp(object):
 
 
         return df
         return df
 
 
-    def df_save_to_tmp_file(self, df=pd.DataFrame()):
-
+    def df_save_to_tmp_file(self, df: pd.DataFrame = pd.DataFrame()):
+        """
+        保存数据帧到临时文件
+        
+        Args:
+            df: 数据帧
+        """
         df = self.trans_df_cols(df)
         df = self.trans_df_cols(df)
 
 
         df = del_blank(df, ['wind_turbine_number'])
         df = del_blank(df, ['wind_turbine_number'])
@@ -133,19 +169,34 @@ class ReadAndSaveTmp(object):
 
 
         self.save_to_tmp_csv(df)
         self.save_to_tmp_csv(df)
 
 
-    def save_to_tmp_csv(self, df):
+    def save_to_tmp_csv(self, df: pd.DataFrame):
+        """
+        保存到临时CSV文件
+        
+        Args:
+            df: 数据帧
+        """
         names = set(df['wind_turbine_number'].values)
         names = set(df['wind_turbine_number'].values)
         if names:
         if names:
-            trans_print("开始保存", str(names), "到临时文件", df.shape)
+            debug("开始保存", str(names), "到临时文件", df.shape)
 
 
             for name in names:
             for name in names:
                 self._save_to_tmp_csv_by_name(df[df['wind_turbine_number'] == name], name)
                 self._save_to_tmp_csv_by_name(df[df['wind_turbine_number'] == name], name)
             del df
             del df
-            trans_print("保存", str(names), "到临时文件成功, 风机数量", len(names))
-
-    def merge_df(self, dir_path):
+            debug("保存", str(names), "到临时文件成功, 风机数量", len(names))
+
+    def merge_df(self, dir_path: str) -> pd.DataFrame:
+        """
+        合并数据帧
+        
+        Args:
+            dir_path: 目录路径
+        
+        Returns:
+            合并后的数据帧
+        """
         all_files = read_excel_files(dir_path)
         all_files = read_excel_files(dir_path)
-        wind_turbine_number = path.basename(dir_path)
+        wind_turbine_number = os.path.basename(dir_path)
         df = pd.DataFrame()
         df = pd.DataFrame()
         for file in all_files:
         for file in all_files:
             now_df = read_file_to_df(file)
             now_df = read_file_to_df(file)
@@ -161,8 +212,13 @@ class ReadAndSaveTmp(object):
         return df
         return df
 
 
     def read_file_and_save_tmp(self):
     def read_file_and_save_tmp(self):
+        """
+        读取文件并保存到临时文件
+        """
         all_files = read_excel_files(self.pathsAndTable.get_excel_tmp_path())
         all_files = read_excel_files(self.pathsAndTable.get_excel_tmp_path())
         split_count = use_files_get_max_cpu_count(all_files)
         split_count = use_files_get_max_cpu_count(all_files)
+        # 限制最大进程数
+        split_count = min(split_count, ParallelProcessing.MAX_PROCESSES)
         all_arrays = split_array(all_files, split_count)
         all_arrays = split_array(all_files, split_count)
 
 
         if self.trans_param.merge_columns:
         if self.trans_param.merge_columns:
@@ -172,7 +228,7 @@ class ReadAndSaveTmp(object):
                         pool.starmap(self.save_merge_data, [(ar,) for ar in arr])
                         pool.starmap(self.save_merge_data, [(ar,) for ar in arr])
 
 
                 except Exception as e:
                 except Exception as e:
-                    trans_print(traceback.format_exc())
+                    error(traceback.format_exc())
                     message = "整理临时文件,系统返回错误:" + str(e)
                     message = "整理临时文件,系统返回错误:" + str(e)
                     raise ValueError(message)
                     raise ValueError(message)
 
 
@@ -180,28 +236,28 @@ class ReadAndSaveTmp(object):
                                                round(20 + 20 * (index + 1) / len(all_arrays), 2),
                                                round(20 + 20 * (index + 1) / len(all_arrays), 2),
                                                self.pathsAndTable.save_db)
                                                self.pathsAndTable.save_db)
 
 
-            dirs = [path.join(self.pathsAndTable.get_merge_tmp_path(), dir_name) for dir_name in
-                    listdir(self.pathsAndTable.get_merge_tmp_path())]
-            dir_total_size = get_dir_size(dirs[0])
-            # split_count = max_file_size_get_max_cpu_count(dir_total_size, memory_percent=1 / 12, cpu_percent=1 / 10)
-            split_count = 2
-            all_arrays = split_array(dirs, split_count)
-            for index, arr in enumerate(all_arrays):
-                try:
-                    with multiprocessing.Pool(split_count) as pool:
-                        pool.starmap(self.merge_df, [(ar,) for ar in arr])
-
-                except Exception as e:
-                    trans_print(traceback.format_exc())
-                    message = "整理临时文件,系统返回错误:" + str(e)
-                    raise ValueError(message)
-
-                update_trans_transfer_progress(self.pathsAndTable.id,
-                                               round(20 + 30 * (index + 1) / len(all_arrays), 2),
-                                               self.pathsAndTable.save_db)
+            dirs = [os.path.join(self.pathsAndTable.get_merge_tmp_path(), dir_name) for dir_name in
+                    os.listdir(self.pathsAndTable.get_merge_tmp_path())]
+            if dirs:
+                dir_total_size = get_dir_size(dirs[0])
+                # 限制最大进程数
+                split_count = min(dir_total_size, ParallelProcessing.MAX_PROCESSES)
+                all_arrays = split_array(dirs, split_count)
+                for index, arr in enumerate(all_arrays):
+                    try:
+                        with multiprocessing.Pool(split_count) as pool:
+                            pool.starmap(self.merge_df, [(ar,) for ar in arr])
+
+                    except Exception as e:
+                        error(traceback.format_exc())
+                        message = "整理临时文件,系统返回错误:" + str(e)
+                        raise ValueError(message)
+
+                    update_trans_transfer_progress(self.pathsAndTable.id,
+                                                   round(20 + 30 * (index + 1) / len(all_arrays), 2),
+                                                   self.pathsAndTable.save_db)
 
 
         else:
         else:
-
             for index, arr in enumerate(all_arrays):
             for index, arr in enumerate(all_arrays):
                 try:
                 try:
                     with multiprocessing.Pool(split_count) as pool:
                     with multiprocessing.Pool(split_count) as pool:
@@ -209,7 +265,7 @@ class ReadAndSaveTmp(object):
                     for df in dfs:
                     for df in dfs:
                         self.df_save_to_tmp_file(df)
                         self.df_save_to_tmp_file(df)
                 except Exception as e:
                 except Exception as e:
-                    trans_print(traceback.format_exc())
+                    error(traceback.format_exc())
                     message = "整理临时文件,系统返回错误:" + str(e)
                     message = "整理临时文件,系统返回错误:" + str(e)
                     raise ValueError(message)
                     raise ValueError(message)
 
 
@@ -217,8 +273,16 @@ class ReadAndSaveTmp(object):
                                                round(20 + 30 * (index + 1) / len(all_arrays), 2),
                                                round(20 + 30 * (index + 1) / len(all_arrays), 2),
                                                self.pathsAndTable.save_db)
                                                self.pathsAndTable.save_db)
 
 
-    def read_excel_to_df(self, file_path):
-
+    def read_excel_to_df(self, file_path: str) -> pd.DataFrame:
+        """
+        读取Excel文件到数据帧
+        
+        Args:
+            file_path: 文件路径
+        
+        Returns:
+            数据帧
+        """
         read_cols = [v.split(",")[0] for k, v in self.trans_param.cols_tran.items() if v and not v.startswith("$")]
         read_cols = [v.split(",")[0] for k, v in self.trans_param.cols_tran.items() if v and not v.startswith("$")]
 
 
         trans_dict = {}
         trans_dict = {}
@@ -300,7 +364,7 @@ class ReadAndSaveTmp(object):
 
 
             for k, v in trans_dict.items():
             for k, v in trans_dict.items():
                 if k.startswith("$file"):
                 if k.startswith("$file"):
-                    file = ".".join(path.basename(file_path).split(".")[0:-1])
+                    file = ".".join(os.path.basename(file_path).split(".")[0:-1])
                     if k == "$file":
                     if k == "$file":
                         ks = k.split("|")
                         ks = k.split("|")
                         bool_contains = False
                         bool_contains = False
@@ -337,7 +401,7 @@ class ReadAndSaveTmp(object):
                     datas = str(k.split(",")[1].replace("$file_date", "").replace("[", "").replace("]", "")).split(":")
                     datas = str(k.split(",")[1].replace("$file_date", "").replace("[", "").replace("]", "")).split(":")
                     if len(datas) != 2:
                     if len(datas) != 2:
                         raise Exception("字段映射出现错误 :" + str(trans_dict))
                         raise Exception("字段映射出现错误 :" + str(trans_dict))
-                    file = ".".join(path.basename(file_path).split(".")[0:-1])
+                    file = ".".join(os.path.basename(file_path).split(".")[0:-1])
                     date_str = str(file[int(datas[0]):int(datas[1])]).strip()
                     date_str = str(file[int(datas[0]):int(datas[1])]).strip()
                     df[v] = df[k.split(",")[0]].apply(lambda x: date_str + " " + str(x))
                     df[v] = df[k.split(",")[0]].apply(lambda x: date_str + " " + str(x))
 
 
@@ -351,8 +415,8 @@ class ReadAndSaveTmp(object):
                     if not bool_contains:
                     if not bool_contains:
                         cengshu = int(str(ks[0].replace("$folder", "").replace("[", "").replace("]", "")))
                         cengshu = int(str(ks[0].replace("$folder", "").replace("[", "").replace("]", "")))
                         for i in range(cengshu):
                         for i in range(cengshu):
-                            folder = path.dirname(folder)
-                        df[v] = str(str(folder).split(sep)[-1]).strip()
+                            folder = os.path.dirname(folder)
+                        df[v] = str(str(folder).split(os.sep)[-1]).strip()
                 elif k.startswith("$sheet_name"):
                 elif k.startswith("$sheet_name"):
                     df[v] = df['sheet_name']
                     df[v] = df['sheet_name']
 
 
@@ -374,9 +438,11 @@ class ReadAndSaveTmp(object):
             return df
             return df
 
 
     def run(self):
     def run(self):
-        trans_print("开始保存数据到临时文件")
+        """
+        """
+        info("开始保存数据到临时文件")
         begin = datetime.datetime.now()
         begin = datetime.datetime.now()
         self.read_file_and_save_tmp()
         self.read_file_and_save_tmp()
         update_trans_transfer_progress(self.pathsAndTable.id, 50,
         update_trans_transfer_progress(self.pathsAndTable.id, 50,
                                        self.pathsAndTable.save_db)
                                        self.pathsAndTable.save_db)
-        trans_print("保存数据到临时文件结束,耗时:", datetime.datetime.now() - begin)
+        info("保存数据到临时文件结束,耗时:", datetime.datetime.now() - begin)

+ 53 - 40
etl/wind_power/min_sec/StatisticsAndSaveTmpFormalFile.py

@@ -5,15 +5,16 @@ from os import path
 import numpy as np
 import numpy as np
 import pandas as pd
 import pandas as pd
 
 
+from conf.constants import DataProcessing, ParallelProcessing, Types
 from etl.common.PathsAndTable import PathsAndTable
 from etl.common.PathsAndTable import PathsAndTable
 from etl.wind_power.min_sec import TransParam
 from etl.wind_power.min_sec import TransParam
 from etl.wind_power.min_sec.ClassIdentifier import ClassIdentifier
 from etl.wind_power.min_sec.ClassIdentifier import ClassIdentifier
 from etl.wind_power.min_sec.FilterValidData import FilterValidData
 from etl.wind_power.min_sec.FilterValidData import FilterValidData
 from service.trans_conf_service import update_trans_transfer_progress
 from service.trans_conf_service import update_trans_transfer_progress
 from utils.conf.read_conf import read_conf
 from utils.conf.read_conf import read_conf
-from utils.df_utils.util import get_time_space
-from utils.file.trans_methods import create_file_path, read_excel_files, read_file_to_df, split_array
-from utils.log.trans_log import trans_print
+from utils.df_utils.util import estimate_time_interval as get_time_space
+from utils.file.trans_methods import create_file_path, read_excel_files, read_file_to_df
+from utils.log.trans_log import debug, error
 from utils.systeminfo.sysinfo import use_files_get_max_cpu_count
 from utils.systeminfo.sysinfo import use_files_get_max_cpu_count
 
 
 exec("import math")
 exec("import math")
@@ -74,13 +75,12 @@ class StatisticsAndSaveTmpFormalFile(object):
             self.trans_param.wind_col_trans).fillna(df['wind_turbine_number'])
             self.trans_param.wind_col_trans).fillna(df['wind_turbine_number'])
         wind_col_name = str(df['wind_turbine_number'].values[0])
         wind_col_name = str(df['wind_turbine_number'].values[0])
 
 
-        not_double_cols = ['wind_turbine_number', 'wind_turbine_name', 'time_stamp', 'param6', 'param7', 'param8',
-                           'param9', 'param10']
+        not_double_cols = DataProcessing.NOT_DOUBLE_COLS
 
 
         # 删除 有功功率 和 风速均为空的情况
         # 删除 有功功率 和 风速均为空的情况
         df.dropna(subset=['active_power', 'wind_velocity'], how='any', inplace=True)
         df.dropna(subset=['active_power', 'wind_velocity'], how='any', inplace=True)
-        trans_print(origin_wind_name, wind_col_name, "删除有功功率和风速有空的情况后:", df.shape)
-        df.replace(np.nan, -999999999, inplace=True)
+        debug(origin_wind_name, wind_col_name, "删除有功功率和风速有空的情况后:", df.shape)
+        df.replace(np.nan, DataProcessing.NAN_REPLACE_VALUE, inplace=True)
         number_cols = df.select_dtypes(include=['number']).columns.tolist()
         number_cols = df.select_dtypes(include=['number']).columns.tolist()
         for col in df.columns:
         for col in df.columns:
             if col not in not_double_cols and col not in number_cols:
             if col not in not_double_cols and col not in number_cols:
@@ -88,8 +88,8 @@ class StatisticsAndSaveTmpFormalFile(object):
                     df[col] = pd.to_numeric(df[col], errors='coerce')
                     df[col] = pd.to_numeric(df[col], errors='coerce')
                     # 删除包含NaN的行(即那些列A转换失败的行)
                     # 删除包含NaN的行(即那些列A转换失败的行)
                     df = df.dropna(subset=[col])
                     df = df.dropna(subset=[col])
-                    trans_print(origin_wind_name, wind_col_name, "删除非数值列名:", col)
-        df.replace(-999999999, np.nan, inplace=True)
+                    debug(origin_wind_name, wind_col_name, "删除非数值列名:", col)
+        df.replace(DataProcessing.NAN_REPLACE_VALUE, np.nan, inplace=True)
 
 
         df.drop_duplicates(['wind_turbine_number', 'time_stamp'], keep='first', inplace=True)
         df.drop_duplicates(['wind_turbine_number', 'time_stamp'], keep='first', inplace=True)
 
 
@@ -102,40 +102,40 @@ class StatisticsAndSaveTmpFormalFile(object):
         # 删除每行有空值的行(2025-3-24)
         # 删除每行有空值的行(2025-3-24)
         # origin_count = df.shape[0]
         # origin_count = df.shape[0]
         # df = df.dropna()
         # df = df.dropna()
-        # trans_print(f'原始数据量:{origin_count},去除na后数据量:{df.shape[0]}')
+        # trans_print(f"原始数据量:{origin_count},去除na后数据量:{df.shape[0]}")
 
 
         # 如果秒级有可能合并到分钟级
         # 如果秒级有可能合并到分钟级
         # TODO add 秒转分钟
         # TODO add 秒转分钟
         if self.trans_param.boolean_sec_to_min:
         if self.trans_param.boolean_sec_to_min:
             df['time_stamp'] = df['time_stamp'].apply(lambda x: x + pd.Timedelta(minutes=(10 - x.minute % 10) % 10))
             df['time_stamp'] = df['time_stamp'].apply(lambda x: x + pd.Timedelta(minutes=(10 - x.minute % 10) % 10))
-            df['time_stamp'] = df['time_stamp'].dt.floor('10T')
+            df['time_stamp'] = df['time_stamp'].dt.floor(DataProcessing.TIME_INTERVAL)
             df = df.groupby(['wind_turbine_number', 'time_stamp']).mean().reset_index()
             df = df.groupby(['wind_turbine_number', 'time_stamp']).mean().reset_index()
-        trans_print('有功功率前10个', df.head(10)['active_power'].values)
+        debug('有功功率前10个', df.head(10)['active_power'].values)
         power_df = df[df['active_power'] > 0]
         power_df = df[df['active_power'] > 0]
-        trans_print(origin_wind_name, wind_col_name, "功率大于0的数量:", power_df.shape)
+        debug(origin_wind_name, wind_col_name, "功率大于0的数量:", power_df.shape)
         power = power_df.sample(int(power_df.shape[0] / 100))['active_power'].median()
         power = power_df.sample(int(power_df.shape[0] / 100))['active_power'].median()
 
 
-        trans_print(origin_wind_name, wind_col_name, '有功功率,中位数', power)
-        if power > 100000:
+        debug(origin_wind_name, wind_col_name, '有功功率,中位数', power)
+        if power > DataProcessing.POWER_UNIT_THRESHOLD:
             df['active_power'] = df['active_power'] / 1000
             df['active_power'] = df['active_power'] / 1000
-        ## 做数据检测前,羡强行处理有功功率
+        # 做数据检测前,羡强行处理有功功率
         # df = df[df['active_power'] < 50000]
         # df = df[df['active_power'] < 50000]
 
 
         rated_power_and_cutout_speed_tuple = read_conf(self.rated_power_and_cutout_speed_map, str(wind_col_name))
         rated_power_and_cutout_speed_tuple = read_conf(self.rated_power_and_cutout_speed_map, str(wind_col_name))
         if rated_power_and_cutout_speed_tuple is None:
         if rated_power_and_cutout_speed_tuple is None:
-            rated_power_and_cutout_speed_tuple = (None, None)
-            trans_print(origin_wind_name, '未从平台匹配到额定功率')
+            # rated_power_and_cutout_speed_tuple = (None, None)
+            error(origin_wind_name, '未从平台匹配到额定功率')
         else:
         else:
-            trans_print(origin_wind_name, '过滤数据前数据大小', df.shape)
-            trans_print(origin_wind_name, '额定功率', rated_power_and_cutout_speed_tuple[0])
+            debug(origin_wind_name, '过滤数据前数据大小', df.shape)
+            debug(origin_wind_name, '额定功率', rated_power_and_cutout_speed_tuple[0])
             # trans_print(origin_wind_name, '\n', df.head(10))
             # trans_print(origin_wind_name, '\n', df.head(10))
             filter_valid_data = FilterValidData(df, rated_power_and_cutout_speed_tuple[0])
             filter_valid_data = FilterValidData(df, rated_power_and_cutout_speed_tuple[0])
             try:
             try:
                 df = filter_valid_data.run()
                 df = filter_valid_data.run()
             except:
             except:
-                trans_print(origin_wind_name, '过滤数据异常', filename)
+                error(origin_wind_name, '过滤数据异常', filename)
                 raise
                 raise
-            trans_print(origin_wind_name, '过滤数据后数据大小', df.shape)
+            debug(origin_wind_name, '过滤数据后数据大小', df.shape)
 
 
             # 如果有需要处理的,先进行代码处理,在进行打标签
             # 如果有需要处理的,先进行代码处理,在进行打标签
             # exec_code = get_trans_exec_code(self.paths_and_table.exec_id, self.paths_and_table.read_type)
             # exec_code = get_trans_exec_code(self.paths_and_table.exec_id, self.paths_and_table.read_type)
@@ -147,10 +147,10 @@ class StatisticsAndSaveTmpFormalFile(object):
             if power_df.shape[0] == 0:
             if power_df.shape[0] == 0:
                 df.loc[:, 'lab'] = -1
                 df.loc[:, 'lab'] = -1
             else:
             else:
-                class_identifiler = ClassIdentifier(wind_turbine_number=origin_wind_name, origin_df=df,
-                                                    rated_power=rated_power_and_cutout_speed_tuple[0],
-                                                    cut_out_speed=rated_power_and_cutout_speed_tuple[1])
-                df = class_identifiler.run()
+                class_identifier = ClassIdentifier(wind_turbine_number=origin_wind_name, origin_df=df,
+                                                   rated_power=rated_power_and_cutout_speed_tuple[0],
+                                                   cut_out_speed=rated_power_and_cutout_speed_tuple[1])
+                df = class_identifier.run()
 
 
             del power_df
             del power_df
 
 
@@ -163,7 +163,7 @@ class StatisticsAndSaveTmpFormalFile(object):
             df['year_month'] = df[['year', 'month']].apply(lambda x: str(x['year']) + str(x['month']).zfill(2), axis=1)
             df['year_month'] = df[['year', 'month']].apply(lambda x: str(x['year']) + str(x['month']).zfill(2), axis=1)
             cols = df.columns
             cols = df.columns
 
 
-            if self.paths_and_table.read_type == 'second':
+            if self.paths_and_table.read_type == Types.SECOND:
                 type_col = 'year_month'
                 type_col = 'year_month'
             else:
             else:
                 type_col = 'year'
                 type_col = 'year'
@@ -185,29 +185,42 @@ class StatisticsAndSaveTmpFormalFile(object):
             self.set_statistics_data(df)
             self.set_statistics_data(df)
 
 
             del df
             del df
-            trans_print("保存" + str(wind_col_name) + "成功")
+            debug("保存" + str(wind_col_name) + "成功")
 
 
-    def mutiprocessing_to_save_file(self):
+    def multiprocessing_to_save_file(self):
         # 开始保存到正式文件
         # 开始保存到正式文件
         all_tmp_files = read_excel_files(self.paths_and_table.get_read_tmp_path())
         all_tmp_files = read_excel_files(self.paths_and_table.get_read_tmp_path())
-        # split_count = self.pathsAndTable.multi_pool_count
-        split_count = use_files_get_max_cpu_count(all_tmp_files)
-        all_arrays = split_array(all_tmp_files, split_count)
+
+        if not all_tmp_files:
+            debug("没有临时文件需要处理")
+            return
+
+        # 计算最佳进程数
+        max_processes = use_files_get_max_cpu_count(all_tmp_files)
+        max_processes = min(max_processes, len(all_tmp_files), ParallelProcessing.MAX_PROCESSES)  # 限制最大进程数
 
 
         try:
         try:
-            for index, arr in enumerate(all_arrays):
-                with multiprocessing.Pool(split_count) as pool:
-                    pool.starmap(self.save_to_csv, [(i,) for i in arr])
-                update_trans_transfer_progress(self.paths_and_table.id,
-                                               round(50 + 15 * (index + 1) / len(all_arrays), 2),
-                                               self.paths_and_table.save_db)
+            # 创建一个进程池处理所有文件
+            with multiprocessing.Pool(max_processes) as pool:
+                # 分批次处理并更新进度
+                batch_size = max(1, len(all_tmp_files) // ParallelProcessing.MAX_BATCHES)  # 最多10个批次
+
+                for i in range(0, len(all_tmp_files), batch_size):
+                    batch_files = all_tmp_files[i:i + batch_size]
+                    pool.starmap(self.save_to_csv, [(file,) for file in batch_files])
+
+                    # 更新进度
+                    progress = 50 + 15 * (i + len(batch_files)) / len(all_tmp_files)
+                    update_trans_transfer_progress(self.paths_and_table.id,
+                                                   round(progress, 2),
+                                                   self.paths_and_table.save_db)
 
 
         except Exception as e:
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             message = "保存文件错误,系统返回错误:" + str(e)
             message = "保存文件错误,系统返回错误:" + str(e)
             raise ValueError(message)
             raise ValueError(message)
 
 
     def run(self):
     def run(self):
-        self.mutiprocessing_to_save_file()
+        self.multiprocessing_to_save_file()
         update_trans_transfer_progress(self.paths_and_table.id, 65,
         update_trans_transfer_progress(self.paths_and_table.id, 65,
                                        self.paths_and_table.save_db)
                                        self.paths_and_table.save_db)

+ 42 - 7
etl/wind_power/min_sec/TransParam.py

@@ -1,23 +1,58 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 # @Time    : 2024/5/16
 # @Time    : 2024/5/16
 # @Author  : 魏志亮
 # @Author  : 魏志亮
+from typing import Optional, Dict, List
 
 
 
 
 class TransParam(object):
 class TransParam(object):
+    """转换参数类
+    
+    存储数据转换过程中的各种参数配置
+    """
 
 
-    def __init__(self, read_type=None, read_path=None, cols_tran=dict(),
-                 wind_name_exec=str(), is_vertical_table=False, vertical_cols=list(), vertical_key=None,
-                 vertical_value=None, index_cols=list(), merge_columns=False, resolve_col_prefix=None,
-                 need_valid_cols=True, wind_col_trans: dict = None, boolean_sec_to_min=False):
+    def __init__(self, 
+                 read_type: Optional[str] = None, 
+                 read_path: Optional[str] = None, 
+                 cols_tran: Dict[str, str] = None,
+                 wind_name_exec: str = "", 
+                 is_vertical_table: bool = False, 
+                 vertical_cols: List[str] = None,
+                 vertical_key: Optional[str] = None,
+                 vertical_value: Optional[str] = None, 
+                 index_cols: List[str] = None, 
+                 merge_columns: bool = False, 
+                 resolve_col_prefix: Optional[str] = None,
+                 need_valid_cols: bool = True, 
+                 wind_col_trans: Optional[Dict[str, str]] = None, 
+                 boolean_sec_to_min: bool = False):
+        """
+        初始化转换参数
+        
+        Args:
+            read_type: 读取类型,如 'second' 或 'minute'
+            read_path: 读取路径
+            cols_tran: 列名转换映射
+            wind_name_exec: 风机名称处理表达式
+            is_vertical_table: 是否为垂直表
+            vertical_cols: 垂直表列名列表
+            vertical_key: 垂直表键列
+            vertical_value: 垂直表值列
+            index_cols: 索引列列表
+            merge_columns: 是否合并列
+            resolve_col_prefix: 列名前缀解析表达式
+            need_valid_cols: 是否需要验证列
+            wind_col_trans: 风机列转换映射
+            boolean_sec_to_min: 是否将秒级数据转换为分钟级
+        """
         self.read_type = read_type
         self.read_type = read_type
         self.read_path = read_path
         self.read_path = read_path
-        self.cols_tran = cols_tran
+        self.cols_tran = cols_tran or {}
         self.is_vertical_table = is_vertical_table
         self.is_vertical_table = is_vertical_table
         self.wind_name_exec = wind_name_exec
         self.wind_name_exec = wind_name_exec
-        self.vertical_cols = vertical_cols
+        self.vertical_cols = vertical_cols or []
         self.vertical_key = vertical_key
         self.vertical_key = vertical_key
         self.vertical_value = vertical_value
         self.vertical_value = vertical_value
-        self.index_cols = index_cols
+        self.index_cols = index_cols or []
         self.merge_columns = merge_columns
         self.merge_columns = merge_columns
         self.resolve_col_prefix = resolve_col_prefix
         self.resolve_col_prefix = resolve_col_prefix
         self.need_valid_cols = need_valid_cols
         self.need_valid_cols = need_valid_cols

+ 53 - 18
etl/wind_power/wave/WaveTrans.py

@@ -1,14 +1,16 @@
 import json
 import json
 import multiprocessing
 import multiprocessing
 import traceback
 import traceback
+from typing import Tuple
 
 
+from conf.constants import ParallelProcessing, Types
 from service.plt_service import get_all_wind
 from service.plt_service import get_all_wind
 from service.trans_conf_service import update_trans_status_running, update_trans_transfer_progress, \
 from service.trans_conf_service import update_trans_status_running, update_trans_transfer_progress, \
     update_trans_status_success, update_trans_status_error
     update_trans_status_success, update_trans_status_error
 from service.trans_service import get_wave_conf, save_df_to_db, get_or_create_wave_table, \
 from service.trans_service import get_wave_conf, save_df_to_db, get_or_create_wave_table, \
     get_wave_data, delete_exist_wave_data
     get_wave_data, delete_exist_wave_data
 from utils.file.trans_methods import *
 from utils.file.trans_methods import *
-from utils.log.trans_log import set_trance_id
+from utils.log.trans_log import set_trance_id, info, error
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 
 
 exec("from os.path import *")
 exec("from os.path import *")
@@ -16,8 +18,17 @@ exec("import re")
 
 
 
 
 class WaveTrans(object):
 class WaveTrans(object):
-
-    def __init__(self, id, wind_farm_code, read_dir):
+    """波形数据转换类"""
+
+    def __init__(self, id: int, wind_farm_code: str, read_dir: str):
+        """
+        初始化波形数据转换类
+        
+        Args:
+            id: 任务ID
+            wind_farm_code: 风电场编码
+            read_dir: 读取目录
+        """
         self.id = id
         self.id = id
         self.wind_farm_code = wind_farm_code
         self.wind_farm_code = wind_farm_code
         self.read_dir = read_dir
         self.read_dir = read_dir
@@ -28,11 +39,28 @@ class WaveTrans(object):
         self.max_date = None
         self.max_date = None
         self.data_count = 0
         self.data_count = 0
 
 
-    def get_data_exec(self, func_code, filepath, measupoint_names: set):
+    def get_data_exec(self, func_code: str, filepath: str, measupoint_names: List[str]) -> Optional[Tuple]:
+        """
+        执行数据获取函数
+        
+        Args:
+            func_code: 函数代码
+            filepath: 文件路径
+            measupoint_names: 测量点名称列表
+        
+        Returns:
+            数据元组
+        """
         exec(func_code)
         exec(func_code)
         return locals()['get_data'](filepath, measupoint_names)
         return locals()['get_data'](filepath, measupoint_names)
 
 
-    def del_exists_data(self, df):
+    def del_exists_data(self, df: pd.DataFrame):
+        """
+        删除已存在的数据
+        
+        Args:
+            df: 数据帧
+        """
         min_date, max_date = df['time_stamp'].min(), df['time_stamp'].max()
         min_date, max_date = df['time_stamp'].min(), df['time_stamp'].max()
         db_df = get_wave_data(self.wind_farm_code + '_wave', min_date, max_date)
         db_df = get_wave_data(self.wind_farm_code + '_wave', min_date, max_date)
 
 
@@ -44,13 +72,17 @@ class WaveTrans(object):
             delete_exist_wave_data(self.wind_farm_code + "_wave", ids)
             delete_exist_wave_data(self.wind_farm_code + "_wave", ids)
 
 
     def run(self):
     def run(self):
+        """运行波形数据转换"""
         update_trans_status_running(self.id)
         update_trans_status_running(self.id)
         trance_id = '-'.join([self.wind_farm_code, 'wave'])
         trance_id = '-'.join([self.wind_farm_code, 'wave'])
         set_trance_id(trance_id)
         set_trance_id(trance_id)
         all_files = read_files(self.read_dir, ['txt', 'csv'])
         all_files = read_files(self.read_dir, ['txt', 'csv'])
         update_trans_transfer_progress(self.id, 5)
         update_trans_transfer_progress(self.id, 5)
+
         # 最大取系统cpu的 1/2
         # 最大取系统cpu的 1/2
         split_count = get_available_cpu_count_with_percent(1 / 2)
         split_count = get_available_cpu_count_with_percent(1 / 2)
+        # 限制最大进程数
+        split_count = min(split_count, ParallelProcessing.MAX_PROCESSES)
 
 
         all_wind, _ = get_all_wind(self.wind_farm_code, False)
         all_wind, _ = get_all_wind(self.wind_farm_code, False)
 
 
@@ -58,11 +90,11 @@ class WaveTrans(object):
 
 
         wave_conf = get_wave_conf(self.wind_farm_code)
         wave_conf = get_wave_conf(self.wind_farm_code)
 
 
-        base_param_exec = wave_conf['base_param_exec']
+        base_param_exec = wave_conf.get('base_param_exec', '')
         map_dict = {}
         map_dict = {}
         if base_param_exec:
         if base_param_exec:
             base_param_exec = base_param_exec.replace('\r\n', '\n').replace('\t', '    ')
             base_param_exec = base_param_exec.replace('\r\n', '\n').replace('\t', '    ')
-            trans_print(base_param_exec)
+            info(base_param_exec)
             if 'import ' in base_param_exec:
             if 'import ' in base_param_exec:
                 raise Exception("方法不支持import方法")
                 raise Exception("方法不支持import方法")
 
 
@@ -72,23 +104,26 @@ class WaveTrans(object):
 
 
         wind_turbine_name_set = set()
         wind_turbine_name_set = set()
 
 
-        all_array = split_array(all_files, split_count * 10)
+        # 优化批次大小
+        batch_size = split_count * 10
+        all_array = split_array(all_files, batch_size)
         total_index = len(all_array)
         total_index = len(all_array)
+
         for index, now_array in enumerate(all_array):
         for index, now_array in enumerate(all_array):
             index_begin = datetime.datetime.now()
             index_begin = datetime.datetime.now()
             with multiprocessing.Pool(split_count) as pool:
             with multiprocessing.Pool(split_count) as pool:
                 try:
                 try:
                     file_datas = pool.starmap(self.get_data_exec,
                     file_datas = pool.starmap(self.get_data_exec,
                                               [(base_param_exec, i, list(map_dict.keys())) for i in now_array])
                                               [(base_param_exec, i, list(map_dict.keys())) for i in now_array])
-                    trans_print(f'总数:{len(now_array)},返回个数{len(file_datas)}')
+                    info(f'总数:{len(now_array)},返回个数{len(file_datas)}')
                 except Exception as e:
                 except Exception as e:
                     message = str(e)
                     message = str(e)
-                    trans_print(traceback.format_exc())
+                    error(traceback.format_exc())
                     update_trans_status_error(self.id, message[0:len(message) if len(message) < 100 else 100])
                     update_trans_status_error(self.id, message[0:len(message) if len(message) < 100 else 100])
                     raise e
                     raise e
 
 
             update_trans_transfer_progress(self.id, 20 + int(index / total_index * 60))
             update_trans_transfer_progress(self.id, 20 + int(index / total_index * 60))
-            trans_print("读取文件耗时:", datetime.datetime.now() - self.begin)
+            info("读取文件耗时:", datetime.datetime.now() - self.begin)
 
 
             result_list = list()
             result_list = list()
             for file_data in file_datas:
             for file_data in file_datas:
@@ -96,7 +131,7 @@ class WaveTrans(object):
                     wind_turbine_name, time_stamp, sampling_frequency, rotational_speed, mesure_point_name, type, mesure_data = \
                     wind_turbine_name, time_stamp, sampling_frequency, rotational_speed, mesure_point_name, type, mesure_data = \
                         file_data[0], file_data[1], file_data[2], file_data[3], file_data[4], file_data[5], file_data[6]
                         file_data[0], file_data[1], file_data[2], file_data[3], file_data[4], file_data[5], file_data[6]
 
 
-                    if mesure_point_name in map_dict.keys():
+                    if mesure_point_name in map_dict:
                         wind_turbine_name_set.add(wind_turbine_name)
                         wind_turbine_name_set.add(wind_turbine_name)
                         if self.min_date is None or self.min_date > time_stamp:
                         if self.min_date is None or self.min_date > time_stamp:
                             self.min_date = time_stamp
                             self.min_date = time_stamp
@@ -109,7 +144,7 @@ class WaveTrans(object):
                              mesure_data])
                              mesure_data])
 
 
             if result_list:
             if result_list:
-                self.data_count = self.data_count + len(result_list)
+                self.data_count += len(result_list)
                 df = pd.DataFrame(result_list,
                 df = pd.DataFrame(result_list,
                                   columns=['wind_turbine_name', 'time_stamp', 'rotational_speed', 'sampling_frequency',
                                   columns=['wind_turbine_name', 'time_stamp', 'rotational_speed', 'sampling_frequency',
                                            'mesure_point_name', 'type', 'mesure_data'])
                                            'mesure_point_name', 'type', 'mesure_data'])
@@ -118,16 +153,16 @@ class WaveTrans(object):
                 df.dropna(subset=['mesure_point_name'], inplace=True)
                 df.dropna(subset=['mesure_point_name'], inplace=True)
                 df['wind_turbine_number'] = df['wind_turbine_name'].map(all_wind).fillna(df['wind_turbine_name'])
                 df['wind_turbine_number'] = df['wind_turbine_name'].map(all_wind).fillna(df['wind_turbine_name'])
 
 
+                # 批量处理JSON序列化
                 df['mesure_data'] = df['mesure_data'].apply(lambda x: json.dumps(x))
                 df['mesure_data'] = df['mesure_data'].apply(lambda x: json.dumps(x))
 
 
                 df.sort_values(by=['time_stamp', 'mesure_point_name'], inplace=True)
                 df.sort_values(by=['time_stamp', 'mesure_point_name'], inplace=True)
                 # self.del_exists_data(df)
                 # self.del_exists_data(df)
                 save_df_to_db(self.wind_farm_code + '_wave', df, batch_count=400)
                 save_df_to_db(self.wind_farm_code + '_wave', df, batch_count=400)
-            trans_print(f"总共{total_index}组,当前{index + 1}", "本次写入耗时:", datetime.datetime.now() - index_begin,
-                        "总耗时:", datetime.datetime.now() - self.begin)
+            info(f"总共{total_index}组,当前{index + 1}", "本次写入耗时:", datetime.datetime.now() - index_begin,
+                 "总耗时:", datetime.datetime.now() - self.begin)
 
 
-        update_trans_status_success(self.id, len(wind_turbine_name_set), None,
+        update_trans_status_success(self.id, len(wind_turbine_name_set), Types.WAVE,
                                     self.min_date, self.max_date, self.data_count)
                                     self.min_date, self.max_date, self.data_count)
 
 
-        # update_trans_status_success(self.id)
-        trans_print("总耗时:", datetime.datetime.now() - self.begin)
+        info("总耗时:", datetime.datetime.now() - self.begin)

+ 3 - 3
service/common_connect.py

@@ -1,5 +1,5 @@
from utils.db.ConnectMysql import MySQLDatabase

# Shared module-level connections: "plt" (platform DB) and "trans" (transfer DB).
plt = MySQLDatabase("plt")

trans = MySQLDatabase("trans")

+ 6 - 2
service/trans_conf_service.py

@@ -4,6 +4,7 @@
 from datetime import datetime
 from datetime import datetime
 
 
 from service.common_connect import trans
 from service.common_connect import trans
+from utils.log.trans_log import info
 
 
 
 
 def update_timeout_trans_data():
 def update_timeout_trans_data():
@@ -46,6 +47,7 @@ def update_trans_status_error(id, message="", save_db=True):
 
 
         message = message if len(message) <= 200 else message[0:200]
         message = message if len(message) <= 200 else message[0:200]
         trans.execute(exec_sql, (message, id))
         trans.execute(exec_sql, (message, id))
+    info("执行失败:", message)
 
 
 
 
 def update_trans_status_success(id, wind_count=0, time_granularity=0,
 def update_trans_status_success(id, wind_count=0, time_granularity=0,
@@ -70,14 +72,16 @@ def update_trans_status_success(id, wind_count=0, time_granularity=0,
             trans.execute(exec_sql, (wind_count, time_granularity, id))
             trans.execute(exec_sql, (wind_count, time_granularity, id))
 
 
 
 
def update_trans_transfer_progress(id, transfer_progress=0, save_db=True):
    """
    Update the transfer progress (percentage) of task ``id``.

    Args:
        id: data_transfer row id.
        transfer_progress: progress value; stored as int.
        save_db: when False, only log the progress without touching the DB.
    """
    print(id, transfer_progress)
    if save_db:
        exec_sql = """
        update data_transfer set transfer_progress =%s where id = %s 
        """
        trans.execute(exec_sql, (int(transfer_progress), id))

    info('当前进度:', transfer_progress)
+
 
 
 def get_now_running_count():
 def get_now_running_count():
     query_running_sql = """
     query_running_sql = """

+ 95 - 72
service/trans_service.py

@@ -9,53 +9,65 @@ import pandas as pd
 from service.common_connect import trans
 from service.common_connect import trans
 from service.trans_conf_service import create_wave_table
 from service.trans_conf_service import create_wave_table
 from utils.file.trans_methods import split_array
 from utils.file.trans_methods import split_array
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, error
 
 
 
 
def get_config(table_name, field_code, trans_type=None, field_name='wind_code', status=1) -> dict:
    """
    Generic configuration lookup shared by the specific get_*_conf helpers.

    Args:
        table_name: configuration table to query.
        field_code: value matched against ``field_name``.
        trans_type: transfer type; required for trans_conf / warn_fault_conf.
        field_name: column matched against ``field_code`` (default wind_code).
        status: row status filter (1 = enabled).

    Returns:
        The first matching configuration row as a dict, or None when the
        query fails or matches nothing.

    Raises:
        ValueError: for warn_fault_conf when trans_type is neither
            'fault' nor 'warn'.
    """
    if table_name == 'warn_fault_conf':
        # Map the logical trans_type to the numeric codes stored in the DB;
        # type 3 means "both fault and warn" and is always included.
        types = list()
        if trans_type == 'fault':
            types.append(1)
        elif trans_type == 'warn':
            types.append(2)
        else:
            error(f"未找到{trans_type}告警/故障的配置")
            raise ValueError(f"未找到{trans_type}告警/故障的配置")
        types.append(3)
        query_sql = f"SELECT * FROM {table_name} where {field_name} = %s and type in %s and status = %s"
        params = (field_code, types, status)
    elif table_name == 'trans_conf':
        # Same query shape whether matching on wind_code or wind_name
        # (the two original branches were identical and are merged here).
        query_sql = f"SELECT * FROM {table_name} where {field_name} = %s and type = %s and status = %s"
        params = (field_code, trans_type, status)
    else:
        query_sql = f"SELECT * FROM {table_name} where {field_name} = %s and status = %s"
        params = (field_code, status)

    res = trans.execute(query_sql, params)
    # execute() reports failure by returning a tuple/str instead of raising.
    if isinstance(res, (tuple, str)):
        return None
    # Guard against an empty result set (previously raised IndexError).
    if not res:
        return None
    return res[0]
 
 
 
 
-def get_min_sec_conf_test(field_code, trans_type) -> dict:
-    query_sql = "SELECT * FROM trans_conf where wind_name = %s and type = %s and status = 1"
-    res = trans.execute(query_sql, (field_code, trans_type))
-    print(res)
-    if type(res) == tuple or type(res) == str:
-        return None
-    return res[0]
def get_min_sec_conf(field_code, trans_type) -> dict:
    """Fetch the minute/second transfer config matched by wind_code."""
    return get_config('trans_conf', field_code, trans_type)
 
 
 
 
-def get_fault_warn_conf(field_code, trans_type) -> dict:
-    types = list()
-    if trans_type == 'fault':
-        types.append(1)
-    elif trans_type == 'warn':
-        types.append(2)
-    else:
-        trans_print(f"未找到{trans_type}告警/故障的配置")
-        raise ValueError(f"未找到{trans_type}告警/故障的配置")
def get_min_sec_conf_test(field_code, trans_type) -> dict:
    """Fetch the minute/second transfer config matched by wind_name (test variant)."""
    return get_config('trans_conf', field_code, trans_type, field_name='wind_name')
 
 
-    types.append(3)
 
 
-    query_sql = "SELECT * FROM warn_fault_conf where wind_code = %s and type in %s and status = 1"
-    res = trans.execute(query_sql, (field_code, types))
-    print(res)
-    if type(res) == tuple or type(res) == str:
-        return None
-    return res[0]
def get_fault_warn_conf(field_code, trans_type) -> dict:
    """Fetch the fault/warn config; trans_type must be 'fault' or 'warn'."""
    return get_config('warn_fault_conf', field_code, trans_type)
 
 
 
 
 def get_wave_conf(field_code) -> dict:
 def get_wave_conf(field_code) -> dict:
-    query_sql = "SELECT * FROM wave_conf where wind_code = %s and status = 1"
-    res = trans.execute(query_sql, (field_code))
-    print(res)
-    if type(res) == tuple or type(res) == str:
-        return None
-    return res[0]
+    return get_config('wave_conf', field_code)
 
 
 
 
 def creat_min_sec_table(table_name, trans_type, wind_farm_name='', use_tidb=False):
 def creat_min_sec_table(table_name, trans_type, wind_farm_name='', use_tidb=False):
@@ -64,7 +76,7 @@ def creat_min_sec_table(table_name, trans_type, wind_farm_name='', use_tidb=Fals
     """
     """
     count = trans.execute(exists_table_sql)[0]['count']
     count = trans.execute(exists_table_sql)[0]['count']
     if count > 0:
     if count > 0:
-        trans_print(f"{table_name}已存在")
+        info(f"{table_name}已存在")
 
 
     if trans_type == 'second':
     if trans_type == 'second':
         add_key = 'KEY `year_month` (`year_month`)'
         add_key = 'KEY `year_month` (`year_month`)'
@@ -197,52 +209,63 @@ def drop_exists_data(table_name, wind_turbine_number, min_date, max_date):
     """
     """
 
 
     count = trans.execute(sql)
     count = trans.execute(sql)
-    trans_print(f"删除数据{count}条,{table_name},{wind_turbine_number},{min_date},{max_date}")
-
+    info(f"删除数据{count}条,{table_name},{wind_turbine_number},{min_date},{max_date}")
 
 
-def save_scada_file_to_db(table_name, file: str, wind_turbine_number, date_str, batch_count=100000, use_tidb=False):
-    base_name = path.basename(file)
-    df = pd.read_csv(file)
-    # if use_tidb:
-    #     min_date = df['time_stamp'].min()
-    #     max_date = df['time_stamp'].max()
-    #     # drop_exists_data(table_name, wind_turbine_number, min_date, max_date)
-    # else:
-    #     add_or_remove_partation(table_name, date_str, wind_turbine_number)
-
-    add_or_remove_partation(table_name, date_str, wind_turbine_number)
 
 
def save_data_to_db(table_name: str, data, batch_count=100000, wind_turbine_number=None, date_str=None, file_name=None):
    """
    Generic persistence helper: save a DataFrame or a CSV file to a table.

    Args:
        table_name: target table.
        data: pandas DataFrame, or a path to a CSV file.
        batch_count: rows per insert batch.
        wind_turbine_number: optional turbine id (used for partitioning and logging).
        date_str: optional date string; together with wind_turbine_number it
            triggers partition maintenance before the insert.
        file_name: optional name used in error/log messages; defaults to the
            CSV basename when ``data`` is a path.

    Raises:
        Exception: re-raised with the file name prepended to the message.
    """
    try:
        if isinstance(data, str):
            # A path was given: load the CSV from disk.
            df = pd.read_csv(data)
            file_name = file_name or path.basename(data)
        else:
            df = data

        # Partition maintenance only applies to per-turbine, per-date saves.
        if wind_turbine_number and date_str:
            add_or_remove_partation(table_name, date_str, wind_turbine_number)

        # The two original branches issued the exact same save call and
        # differed only in the log line; unified here.
        trans.execute_df_save(df, table_name, batch_count)
        if wind_turbine_number:
            info(f"保存到{table_name},{file_name},{wind_turbine_number} 成功,总条数:{df.shape[0]}")
        else:
            info(f"保存到{table_name}成功,总条数:{df.shape[0]}")
    except Exception as e:
        # Keep the full stack in the log (it was dropped in the refactor)
        # and chain the original exception for debuggability.
        error(traceback.format_exc())
        if file_name:
            message = file_name + str(e)
        else:
            message = str(e)
        raise Exception(message) from e
 
 
 
 
def save_scada_file_to_db(table_name, file: str, wind_turbine_number, date_str, batch_count=100000, use_tidb=False):
    """Save a SCADA CSV file with partition handling (use_tidb is kept for compatibility and ignored)."""
    save_data_to_db(table_name, file, batch_count, wind_turbine_number, date_str)
+
+
 def save_file_to_db(table_name: str, file: str, batch_count=100000):
 def save_file_to_db(table_name: str, file: str, batch_count=100000):
-    base_name = path.basename(file)
-    try:
-        df = pd.read_csv(file)
-        trans_print(f"保存{table_name},总条数:{df.shape[0]}")
-        trans.execute_df_save(df, table_name, batch_count)
-        trans_print(f"保存到{table_name}成功,总条数:{df.shape[0]}")
-    except Exception as e:
-        trans_print(traceback.format_exc())
-        message = base_name + str(e)
-        raise Exception(message)
+    save_data_to_db(table_name, file, batch_count)
 
 
 
 
def save_df_to_db(table_name: str, df: pd.DataFrame, batch_count=100000):
    """Save an in-memory DataFrame to ``table_name`` without partition handling."""
    save_data_to_db(table_name, df, batch_count)
 
 
 
 
 def batch_statistics(table_name):
 def batch_statistics(table_name):
@@ -251,7 +274,7 @@ def batch_statistics(table_name):
         res = trans.execute(query_sql)
         res = trans.execute(query_sql)
         return res[0]
         return res[0]
     except:
     except:
-        trans_print(traceback.format_exc())
+        error(traceback.format_exc())
         return None
         return None
 
 
 
 
@@ -319,7 +342,7 @@ def get_trans_exec_code(id, query_type):
     if type(res) == tuple or type(res) == str:
     if type(res) == tuple or type(res) == str:
         return None
         return None
     exec_code = res[0]['exec_code']
     exec_code = res[0]['exec_code']
-    trans_print("任务ID", id, '类型', type, '获取到执行代码:', exec_code)
+    info("任务ID", id, '类型', type, '获取到执行代码:', exec_code)
     return exec_code
     return exec_code
 
 
 
 

+ 4 - 2
utils/common.py

@@ -1,3 +1,5 @@
from conf.constants import FileTypes

# Legacy module-level aliases; new code should read conf.constants.FileTypes directly.
excel_types = FileTypes.EXCEL_TYPES

zip_types = FileTypes.ZIP_TYPES

+ 137 - 12
utils/conf/read_conf.py

@@ -1,22 +1,147 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 # @Time    : 2024/6/7
 # @Time    : 2024/6/7
 # @Author  : 魏志亮
 # @Author  : 魏志亮
+import os
 
 
 import yaml
 import yaml
+from typing import Any, Optional, Dict
 
 
 
 
-def yaml_conf(path, encoding='utf-8'):
-    with open(path, 'r', encoding=encoding) as f:
-        data = yaml.safe_load(f)
-    return data
def load_yaml_config(file_path: str, encoding: str = 'utf-8') -> Dict[str, Any]:
    """
    Load a YAML configuration file.

    Args:
        file_path: path of the YAML file.
        encoding: file encoding, defaults to utf-8.

    Returns:
        The parsed configuration dict; an empty dict when the file is empty
        or its root node is not a mapping.

    Raises:
        FileNotFoundError: when the file does not exist.
        yaml.YAMLError: when the file cannot be parsed.
    """
    try:
        with open(file_path, 'r', encoding=encoding) as f:
            data = yaml.safe_load(f)
            # Guard against an empty file / non-mapping root (safe_load -> None).
            return data if isinstance(data, dict) else {}
    except FileNotFoundError as e:
        # Re-raise with a clearer message, chaining the original cause.
        raise FileNotFoundError(f"配置文件不存在: {file_path}") from e
    except yaml.YAMLError as e:
        raise yaml.YAMLError(f"YAML解析错误: {e}") from e
 
 
 
 
def get_config_value(config: Dict[str, Any], key: str, default: Optional[Any] = None) -> Any:
    """
    Safely read a value from a configuration dict.

    Supports dotted keys ("database.host") for nested lookups. An exact
    top-level key is tried first, so legacy keys that literally contain a
    dot keep working (backward compatible with the old read_conf()).

    Args:
        config: configuration dict (may be None).
        key: key name, optionally dotted for nesting.
        default: returned when the key is missing or its value is None.

    Returns:
        The configuration value, or ``default``.
    """
    if config is None:
        return default

    if isinstance(config, dict) and key in config:
        # Exact match wins — handles keys that contain a literal '.'.
        value = config[key]
    else:
        # Walk the dotted path through nested dicts.
        value = config
        for part in key.split('.'):
            if isinstance(value, dict) and part in value:
                value = value[part]
            else:
                value = None
                break

    # None is treated as "missing" when a default was supplied.
    if value is None and default is not None:
        return default

    return value
 
 
+
def merge_configs(base_config: Dict[str, Any], override_config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge two configuration dicts, with ``override_config`` taking precedence.

    Nested dicts are merged recursively; any other value type is replaced
    outright. The input dicts themselves are not modified (top levels are
    copied; untouched nested values may still be shared by reference).

    Args:
        base_config: default configuration.
        override_config: overriding configuration.

    Returns:
        A new merged dict.
    """
    merged = base_config.copy()

    for key, incoming in override_config.items():
        existing = merged.get(key)
        if isinstance(existing, dict) and isinstance(incoming, dict):
            # Both sides are mappings: merge recursively.
            merged[key] = merge_configs(existing, incoming)
        else:
            # Scalars, lists, or type mismatches: override wins.
            merged[key] = incoming

    return merged
+
+
def load_config_with_env(file_path: str, encoding: str = 'utf-8') -> Dict[str, Any]:
    """
    Load a YAML config file, then apply ``ETL_*`` environment overrides.

    An environment variable such as ``ETL_DATABASE_HOST=x`` becomes the
    nested key ``database.host``. Values are coerced: 'true'/'false' to
    bool, pure digits to int, a single-dot digit string to float,
    everything else stays a string.

    Args:
        file_path: YAML file path.
        encoding: file encoding, defaults to utf-8.

    Returns:
        The merged configuration dict.
    """
    base_config = load_yaml_config(file_path, encoding)

    env_prefix = "ETL_"
    override_config = {}

    for env_name, raw_value in os.environ.items():
        if not env_name.startswith(env_prefix):
            continue

        # ETL_DATABASE_HOST -> "database.host"
        config_key = env_name[len(env_prefix):].lower().replace('_', '.')
        parsed_value = _parse_env_value(raw_value)

        # Build the nested dict path for the dotted key.
        keys = config_key.split('.')
        node = override_config
        for part in keys[:-1]:
            node = node.setdefault(part, {})
        node[keys[-1]] = parsed_value

    if override_config:
        base_config = merge_configs(base_config, override_config)

    return base_config


def _parse_env_value(value: str) -> Any:
    """Coerce an environment-variable string to bool/int/float when possible."""
    lowered = value.lower()
    if lowered == 'true':
        return True
    if lowered == 'false':
        return False
    if value.isdigit():
        return int(value)
    # BUG FIX: the previous check ('.' in value and all parts digit) accepted
    # strings like "1.2.3" and then crashed in float(); require exactly one dot.
    parts = value.split('.')
    if len(parts) == 2 and all(part.isdigit() for part in parts):
        return float(value)
    return value


# Backward-compatible aliases for the legacy API names.
yaml_conf = load_yaml_config
read_conf = get_config_value

+ 231 - 41
utils/db/ConnectMysql.py

@@ -1,56 +1,246 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/6/7
+# @Author  : 魏志亮
+
+import os
 import traceback
 import traceback
-from os import *
+from typing import Any, Dict, List, Tuple, Union
 
 
 import pandas as pd
 import pandas as pd
 import pymysql
 import pymysql
 from pymysql.cursors import DictCursor
 from pymysql.cursors import DictCursor
 from sqlalchemy import create_engine
 from sqlalchemy import create_engine
+from sqlalchemy.engine import Engine
+
+from utils.conf.read_conf import load_yaml_config
+from utils.log.trans_log import error, info, debug
+
+
class MySQLDatabase:
    """MySQL connection manager configured from the ETL_CONF YAML file."""

    # Class-level engine cache so every instance pointing at the same
    # host/port/user/database reuses one SQLAlchemy engine (and its pool).
    _engine_cache = {}

    def __init__(self, connection_name: str):
        """
        Initialize the connection manager.

        Args:
            connection_name: name of the connection section in the config file.

        Raises:
            ValueError: when the ETL_CONF environment variable is not set.
            KeyError: when the section is missing or incomplete.
        """
        config_path = os.environ.get('ETL_CONF')
        if not config_path:
            raise ValueError("环境变量 ETL_CONF 未设置")

        self.yaml_data = load_yaml_config(config_path)
        self.connection_name = connection_name

        if connection_name not in self.yaml_data:
            raise KeyError(f"配置中不存在连接名称: {connection_name}")

        self.config = self.yaml_data[connection_name]
        self.database = self.config.get('database', '')

        # Fail fast when the section lacks mandatory connection settings.
        required_keys = ['host', 'user', 'password', 'database']
        missing_keys = [key for key in required_keys if key not in self.config]
        if missing_keys:
            raise KeyError(f"连接配置缺少必要项: {missing_keys}")
+
+    def get_connection(self) -> pymysql.Connection:
+        """
+        从连接池中获取一个连接
+        
+        Returns:
+            pymysql连接对象
+        """
+        # 创建连接配置副本,避免修改原配置
+        conn_config = self.config.copy()
+        # 移除可能不需要的配置项(如果有)
+        conn_config.pop('charset', None)  # pymysql连接时charset参数可能会冲突
+
+        return pymysql.connect(
+            cursorclass=DictCursor,
+            charset='utf8mb4',
+            **conn_config
+        )
+
+    def execute_query(self, sql: str, params: Union[Tuple, List, Dict] = None) -> List[Dict[str, Any]]:
+        """
+        执行SQL查询并返回结果
+        
+        Args:
+            sql: SQL语句
+            params: SQL参数,可以是元组、列表或字典
+            
+        Returns:
+            查询结果列表,每个元素为字典形式
+            
+        Raises:
+            Exception: SQL执行错误时抛出
+        """
+        params = params or ()
+        conn = None
+        cursor = None
+
+        try:
+            conn = self.get_connection()
+            cursor = conn.cursor()
+
+            # 执行SQL
+            cursor.execute(sql, params)
+            debug("开始执行SQL:\n", cursor._executed)
 
 
-from utils.conf.read_conf import yaml_conf
-from utils.log.trans_log import trans_print
+            # 提交事务
+            conn.commit()
 
 
+            # 获取结果
+            result = cursor.fetchall()
+            return result
 
 
-class ConnectMysql:
+        except Exception as e:
+            error(f"执行SQL出错: {sql}")
+            error(f"错误信息: {e}")
+            error(traceback.format_exc())
 
 
-    def __init__(self, connet_name):
-        self.yaml_data = yaml_conf(environ.get('ETL_CONF'))
-        self.connet_name = connet_name
-        self.config = self.yaml_data[self.connet_name]
-        self.database = self.config['database']
+            if conn:
+                conn.rollback()
+            raise e
 
 
-    # 从连接池中获取一个连接
-    def get_conn(self):
-        return pymysql.connect(**self.config)
+        finally:
+            # 确保资源被释放
+            if cursor:
+                cursor.close()
+            if conn:
+                conn.close()
 
 
-    # 使用连接执行sql
-    def execute(self, sql, params=tuple()):
+    def execute_update(self, sql: str, params: Union[Tuple, List, Dict] = None) -> int:
+        """
+        执行更新操作(INSERT, UPDATE, DELETE)
+        
+        Args:
+            sql: SQL语句
+            params: SQL参数
+            
+        Returns:
+            影响的行数
+        """
+        params = params or ()
+        conn = None
+        cursor = None
 
 
-        with self.get_conn() as conn:
-            with conn.cursor(cursor=DictCursor) as cursor:
-                try:
-                    cursor.execute(sql, params)
-                    trans_print("开始执行SQL:", cursor._executed)
-                    conn.commit()
-                    result = cursor.fetchall()
-                    return result
-                except Exception as e:
-                    trans_print(f"执行sql:{sql},报错:{e}")
-                    trans_print(traceback.format_exc())
-                    conn.rollback()
-                    raise e
+        try:
+            conn = self.get_connection()
+            cursor = conn.cursor()
 
 
-    def get_engine(self):
+            cursor.execute(sql, params)
+            debug("开始执行SQL:", cursor._executed)
+
+            conn.commit()
+            return cursor.rowcount
+
+        except Exception as e:
+            error(f"执行更新SQL出错: {sql}")
+            error(f"错误信息: {e}")
+            error(traceback.format_exc())
+
+            if conn:
+                conn.rollback()
+            raise e
+
+        finally:
+            if cursor:
+                cursor.close()
+            if conn:
+                conn.close()
+
+    def get_engine(self) -> Engine:
+        """
+        获取SQLAlchemy引擎,使用缓存避免重复创建
+        
+        Returns:
+            SQLAlchemy引擎对象
+        """
+        # 构建缓存键
         config = self.config
         config = self.config
-        username = config['user']
-        password = config['password']
-        host = config['host']
-        port = config['port']
-        dbname = config['database']
-        return create_engine(f'mysql+pymysql://{username}:{password}@{host}:{port}/{dbname}')
-
-    def execute_df_save(self, df, table_name, chunk_size=10000):
-        df.to_sql(table_name, self.get_engine(), index=False, if_exists='append', chunksize=chunk_size)
-
-    def read_sql_to_df(self, sql):
-        df = pd.read_sql_query(sql, self.get_engine())
-        return df
+        cache_key = f"{config['host']}:{config['port']}:{config['user']}:{config['database']}"
+
+        # 检查缓存中是否已有引擎
+        if cache_key not in self._engine_cache:
+            username = config['user']
+            password = config['password']
+            host = config['host']
+            port = config['port']
+            dbname = config['database']
+
+            # 构建连接URL
+            connection_url = f'mysql+pymysql://{username}:{password}@{host}:{port}/{dbname}?charset=utf8mb4'
+
+            # 创建引擎并缓存
+            self._engine_cache[cache_key] = create_engine(
+                connection_url,
+                pool_size=10,  # 增加连接池大小
+                pool_recycle=3600,
+                pool_pre_ping=True,  # 连接池预ping,确保连接有效
+                echo=False  # 设置为True可打印SQL日志
+            )
+
+        return self._engine_cache[cache_key]
+
+    def save_dataframe(self, df: pd.DataFrame, table_name: str, chunk_size: int = 10000,
+                       if_exists: str = 'append') -> None:
+        """
+        将DataFrame保存到数据库表
+        
+        Args:
+            df: pandas DataFrame对象
+            table_name: 目标表名
+            chunk_size: 每批写入的行数
+            if_exists: 表存在时的处理方式:'fail', 'replace', 'append'
+        """
+        try:
+            df.to_sql(
+                table_name,
+                self.get_engine(),
+                index=False,
+                if_exists=if_exists,
+                chunksize=chunk_size,
+                method='multi'  # 使用多值插入提高性能
+            )
+            info(f"成功保存 {len(df)} 条数据到表 {table_name}")
+
+        except Exception as e:
+            error(f"保存DataFrame到表 {table_name} 失败: {e}")
+            error(traceback.format_exc())
+            raise e
+
+    def read_sql_to_dataframe(self, sql: str) -> pd.DataFrame:
+        """
+        执行SQL查询并返回DataFrame
+        
+        Args:
+            sql: SQL查询语句
+            
+        Returns:
+            查询结果的DataFrame
+        """
+        try:
+            df = pd.read_sql_query(sql, self.get_engine())
+            debug(f"查询返回 {len(df)} 行数据")
+            return df
+
+        except Exception as e:
+            error(f"执行SQL查询失败: {sql}")
+            error(f"错误信息: {e}")
+            error(traceback.format_exc())
+            raise e
+
+    # 为了保持向后兼容,保留原方法名(可选)
+    get_conn = get_connection
+    execute = execute_query
+    execute_df_save = save_dataframe
+    read_sql_to_df = read_sql_to_dataframe

+ 6 - 6
utils/db/ConnectMysql_tidb_fix.py

@@ -8,7 +8,7 @@ from pymysql.cursors import DictCursor
 from sqlalchemy import create_engine
 from sqlalchemy import create_engine
 
 
 from utils.conf.read_conf import yaml_conf
 from utils.conf.read_conf import yaml_conf
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import error, debug
 
 
 
 
 class ConnectMysql:
 class ConnectMysql:
@@ -30,13 +30,13 @@ class ConnectMysql:
             with conn.cursor(cursor=DictCursor) as cursor:
             with conn.cursor(cursor=DictCursor) as cursor:
                 try:
                 try:
                     cursor.execute(sql, params)
                     cursor.execute(sql, params)
-                    trans_print("开始执行SQL:", cursor._executed)
+                    debug("开始执行SQL:", cursor._executed)
                     conn.commit()
                     conn.commit()
                     result = cursor.fetchall()
                     result = cursor.fetchall()
                     return result
                     return result
                 except Exception as e:
                 except Exception as e:
-                    trans_print(f"执行sql:{sql},报错:{e}")
-                    trans_print(traceback.format_exc())
+                    error(f"执行sql:{sql},报错:{e}")
+                    error(traceback.format_exc())
                     conn.rollback()
                     conn.rollback()
                     raise e
                     raise e
 
 
@@ -66,10 +66,10 @@ class ConnectMysql:
                     df.to_sql(table_name, engine, if_exists='append', index=False, chunksize=chunksize)
                     df.to_sql(table_name, engine, if_exists='append', index=False, chunksize=chunksize)
                 except Exception as e:
                 except Exception as e:
                     retry_count += 1
                     retry_count += 1
-                    trans_print(f" 第 {retry_count} 次重试, 错误: {str(e)}")
+                    error(f" 第 {retry_count} 次重试, 错误: {str(e)}")
                     time.sleep(5 * retry_count)  # 指数退避
                     time.sleep(5 * retry_count)  # 指数退避
                     if retry_count == max_retries:
                     if retry_count == max_retries:
-                        trans_print(f"处理失败: {str(e)}")
+                        error(f"处理失败: {str(e)}")
                         raise
                         raise
         except Exception as e:
         except Exception as e:
             engine.dispose()
             engine.dispose()

+ 2 - 3
utils/df_utils/util.py

@@ -6,7 +6,7 @@ import datetime
 import pandas as pd
 import pandas as pd
 
 
 
 
-def get_time_space(df, time_str):
+def estimate_time_interval(df, time_str):
     """
     """
     :return: 查询时间间隔
     :return: 查询时间间隔
     """
     """
@@ -15,7 +15,6 @@ def get_time_space(df, time_str):
     df1['chazhi'] = df1[time_str].shift(-1) - df1[time_str]
     df1['chazhi'] = df1[time_str].shift(-1) - df1[time_str]
     result = df1.sample(int(df1.shape[0] / 100))['chazhi'].value_counts().idxmax().seconds
     result = df1.sample(int(df1.shape[0] / 100))['chazhi'].value_counts().idxmax().seconds
     del df1
     del df1
-    print(datetime.datetime.now() - begin)
     return result
     return result
 
 
 
 
@@ -46,7 +45,7 @@ def calculate_time_difference(now: datetime.datetime, date: datetime.datetime):
 if __name__ == '__main__':
 if __name__ == '__main__':
     df = pd.read_csv(r"D:\data\清理数据\密马风电场\test_11_test\minute\WOG00469.csv")
     df = pd.read_csv(r"D:\data\清理数据\密马风电场\test_11_test\minute\WOG00469.csv")
     df['time_stamp'] = pd.to_datetime(df['time_stamp'])
     df['time_stamp'] = pd.to_datetime(df['time_stamp'])
-    space = get_time_space(df, 'time_stamp')
+    space = estimate_time_interval(df, 'time_stamp')
     min = df['time_stamp'].min()
     min = df['time_stamp'].min()
     max = df['time_stamp'].max()
     max = df['time_stamp'].max()
     result = get_time_space_count(min, max, space)
     result = get_time_space_count(min, max, space)

+ 139 - 38
utils/file/trans_methods.py

@@ -6,25 +6,35 @@ import datetime
 import os
 import os
 import shutil
 import shutil
 import warnings
 import warnings
+from typing import List, Dict, Optional
 
 
 import chardet
 import chardet
 import pandas as pd
 import pandas as pd
 
 
-from utils.common import excel_types, zip_types
-from utils.log.trans_log import trans_print
+from conf.constants import FileTypes
+from utils.log.trans_log import error, debug
 
 
 warnings.filterwarnings("ignore")
 warnings.filterwarnings("ignore")
 
 
 
 
 # 获取文件编码
 # 获取文件编码
-def detect_file_encoding(filename):
+def detect_file_encoding(filename: str) -> str:
+    """
+    检测文件编码
+    
+    Args:
+        filename: 文件路径
+    
+    Returns:
+        检测到的编码
+    """
     # 读取文件的前1000个字节(足够用于大多数编码检测)
     # 读取文件的前1000个字节(足够用于大多数编码检测)
     with open(filename, 'rb') as f:
     with open(filename, 'rb') as f:
         rawdata = f.read(1000)
         rawdata = f.read(1000)
     result = chardet.detect(rawdata)
     result = chardet.detect(rawdata)
     encoding = result['encoding']
     encoding = result['encoding']
 
 
-    trans_print("文件类型:", filename, encoding)
+    debug("文件类型:", filename, encoding)
 
 
     if encoding is None:
     if encoding is None:
         encoding = 'gb18030'
         encoding = 'gb18030'
@@ -35,19 +45,52 @@ def detect_file_encoding(filename):
     return 'gb18030'
     return 'gb18030'
 
 
 
 
-def del_blank(df=pd.DataFrame(), cols=list()):
+def del_blank(df: pd.DataFrame = pd.DataFrame(), cols: Optional[List[str]] = None) -> pd.DataFrame:
+    """
+    删除指定列的空白字符
+    
+    Args:
+        df: 数据帧
+        cols: 要处理的列列表
+    
+    Returns:
+        处理后的数据帧
+    """
+    if cols is None:
+        cols = []
     for col in cols:
     for col in cols:
-        if df[col].dtype == object:
+        if col in df.columns and df[col].dtype == object:
             df[col] = df[col].str.strip()
             df[col] = df[col].str.strip()
     return df
     return df
 
 
 
 
 # 切割数组到多个数组
 # 切割数组到多个数组
-def split_array(array, num):
+def split_array(array: List, num: int) -> List[List]:
+    """
+    将数组切割成多个子数组
+    
+    Args:
+        array: 原始数组
+        num: 每个子数组的长度
+    
+    Returns:
+        子数组列表
+    """
     return [array[i:i + num] for i in range(0, len(array), num)]
     return [array[i:i + num] for i in range(0, len(array), num)]
 
 
 
 
-def find_read_header(file_path, trans_cols, resolve_col_prefix=None):
+def find_read_header(file_path: str, trans_cols: List[str], resolve_col_prefix: Optional[str] = None) -> Optional[int]:
+    """
+    查找文件的表头行
+    
+    Args:
+        file_path: 文件路径
+        trans_cols: 要匹配的列名列表
+        resolve_col_prefix: 列名前缀解析表达式
+    
+    Returns:
+        表头行索引
+    """
     df = read_file_to_df(file_path, nrows=20)
     df = read_file_to_df(file_path, nrows=20)
     df.reset_index(inplace=True)
     df.reset_index(inplace=True)
     count = 0
     count = 0
@@ -59,7 +102,7 @@ def find_read_header(file_path, trans_cols, resolve_col_prefix=None):
 
 
     for col in trans_cols:
     for col in trans_cols:
         if col in df_cols:
         if col in df_cols:
-            count = count + 1
+            count += 1
             if count >= 2:
             if count >= 2:
                 header = 0
                 header = 0
                 break
                 break
@@ -73,7 +116,7 @@ def find_read_header(file_path, trans_cols, resolve_col_prefix=None):
             values = row.values
             values = row.values
         for col in trans_cols:
         for col in trans_cols:
             if col in values:
             if col in values:
-                count = count + 1
+                count += 1
                 if count > 2:
                 if count > 2:
                     header = index + 1
                     header = index + 1
                     return header
                     return header
@@ -82,30 +125,44 @@ def find_read_header(file_path, trans_cols, resolve_col_prefix=None):
 
 
 
 
 # 读取数据到df
 # 读取数据到df
-def read_file_to_df(file_path, read_cols=list(), trans_cols=None, nrows=None, not_find_header='raise',
-                    resolve_col_prefix=None):
+def read_file_to_df(file_path: str, read_cols: Optional[List[str]] = None, trans_cols: Optional[List[str]] = None,
+                    nrows: Optional[int] = None, not_find_header: str = 'raise',
+                    resolve_col_prefix: Optional[str] = None) -> pd.DataFrame:
+    """
+    读取文件到数据帧
+    
+    Args:
+        file_path: 文件路径
+        read_cols: 要读取的列列表
+        trans_cols: 要匹配的列名列表
+        nrows: 读取的行数
+        not_find_header: 未找到表头时的处理方式
+        resolve_col_prefix: 列名前缀解析表达式
+    
+    Returns:
+        读取的数据帧
+    """
     begin = datetime.datetime.now()
     begin = datetime.datetime.now()
-    trans_print('开始读取文件', file_path)
+    debug('开始读取文件', file_path)
     header = 0
     header = 0
-    find_cols = list()
     if trans_cols:
     if trans_cols:
         header = find_read_header(file_path, trans_cols, resolve_col_prefix)
         header = find_read_header(file_path, trans_cols, resolve_col_prefix)
-        trans_print(os.path.basename(file_path), "读取第", header, "行")
+        debug(os.path.basename(file_path), "读取第", header, "行")
         if header is None:
         if header is None:
             if not_find_header == 'raise':
             if not_find_header == 'raise':
                 message = '未匹配到开始行,请检查并重新指定'
                 message = '未匹配到开始行,请检查并重新指定'
-                trans_print(message)
+                debug(message)
                 raise Exception(message)
                 raise Exception(message)
             elif not_find_header == 'ignore':
             elif not_find_header == 'ignore':
                 pass
                 pass
 
 
-    # read_cols.extend(find_cols)
     df = pd.DataFrame()
     df = pd.DataFrame()
     if header is not None:
     if header is not None:
         try:
         try:
-            if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
+            file_path_lower = str(file_path).lower()
+            if file_path_lower.endswith("csv") or file_path_lower.endswith("gz"):
                 encoding = detect_file_encoding(file_path)
                 encoding = detect_file_encoding(file_path)
-                end_with_gz = str(file_path).lower().endswith("gz")
+                end_with_gz = file_path_lower.endswith("gz")
                 if read_cols:
                 if read_cols:
                     if end_with_gz:
                     if end_with_gz:
                         df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip',
                         df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip',
@@ -115,7 +172,6 @@ def read_file_to_df(file_path, read_cols=list(), trans_cols=None, nrows=None, no
                         df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header,
                         df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header,
                                          on_bad_lines='warn', nrows=nrows)
                                          on_bad_lines='warn', nrows=nrows)
                 else:
                 else:
-
                     if end_with_gz:
                     if end_with_gz:
                         df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header, nrows=nrows)
                         df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header, nrows=nrows)
                     else:
                     else:
@@ -135,16 +191,25 @@ def read_file_to_df(file_path, read_cols=list(), trans_cols=None, nrows=None, no
                     now_df['sheet_name'] = sheet_name
                     now_df['sheet_name'] = sheet_name
                     df = pd.concat([df, now_df])
                     df = pd.concat([df, now_df])
                 xls.close()
                 xls.close()
-            trans_print('文件读取成功:', file_path, '数据数量:', df.shape, '耗时:', datetime.datetime.now() - begin)
+            debug('文件读取成功:', file_path, '数据数量:', df.shape, '耗时:', datetime.datetime.now() - begin)
         except Exception as e:
         except Exception as e:
-            trans_print('读取文件出错', file_path, str(e))
+            error('读取文件出错', file_path, str(e))
             message = '文件:' + os.path.basename(file_path) + ',' + str(e)
             message = '文件:' + os.path.basename(file_path) + ',' + str(e)
             raise ValueError(message)
             raise ValueError(message)
 
 
     return df
     return df
 
 
 
 
-def __build_directory_dict(directory_dict, path, filter_types=None):
+def __build_directory_dict(directory_dict: Dict[str, List[str]], path: str,
+                           filter_types: Optional[List[str]] = None) -> None:
+    """
+    构建目录文件字典
+    
+    Args:
+        directory_dict: 目录文件字典
+        path: 目录路径
+        filter_types: 文件类型过滤器
+    """
     # 遍历目录下的所有项
     # 遍历目录下的所有项
     for item in os.listdir(path):
     for item in os.listdir(path):
         item_path = os.path.join(path, item)
         item_path = os.path.join(path, item)
@@ -156,18 +221,31 @@ def __build_directory_dict(directory_dict, path, filter_types=None):
 
 
             if filter_types is None or len(filter_types) == 0:
             if filter_types is None or len(filter_types) == 0:
                 directory_dict[path].append(item_path)
                 directory_dict[path].append(item_path)
-            elif str(item_path).split(".")[-1] in filter_types:
-                if str(item_path).count("~$") == 0:
+            else:
+                # 获取文件扩展名
+                ext = os.path.splitext(item_path)[1].lstrip('.').lower()
+                if ext in filter_types and "~$" not in item_path:
                     directory_dict[path].append(item_path)
                     directory_dict[path].append(item_path)
 
 
 
 
 # 读取路径下所有的excel文件
 # 读取路径下所有的excel文件
-def read_excel_files(read_path, filter_types=None):
+def read_excel_files(read_path: str, filter_types: Optional[List[str]] = None) -> List[str]:
+    """
+    读取路径下所有的Excel文件
+    
+    Args:
+        read_path: 读取路径
+        filter_types: 文件类型过滤器
+    
+    Returns:
+        文件路径列表
+    """
     if not os.path.exists(read_path):
     if not os.path.exists(read_path):
         return []
         return []
 
 
     if filter_types is None:
     if filter_types is None:
-        filter_types = ['xls', 'xlsx', 'csv', 'gz']
+        # filter_types = ['xls', 'xlsx', 'csv', 'gz']
+        filter_types = FileTypes.EXCEL_TYPES
     if os.path.isfile(read_path):
     if os.path.isfile(read_path):
         return [read_path]
         return [read_path]
 
 
@@ -178,10 +256,20 @@ def read_excel_files(read_path, filter_types=None):
 
 
 
 
 # 读取路径下所有的文件
 # 读取路径下所有的文件
-def read_files(read_path, filter_types=None):
+def read_files(read_path: str, filter_types: Optional[List[str]] = None) -> List[str]:
+    """
+    读取路径下所有的文件
+    
+    Args:
+        read_path: 读取路径
+        filter_types: 文件类型过滤器
+    
+    Returns:
+        文件路径列表
+    """
     if filter_types is None:
     if filter_types is None:
-        filter_types = [i for i in excel_types]
-        filter_types.extend(zip_types)
+        filter_types = list(FileTypes.EXCEL_TYPES)
+        filter_types.extend(FileTypes.ZIP_TYPES)
     if os.path.isfile(read_path):
     if os.path.isfile(read_path):
         return [read_path]
         return [read_path]
     directory_dict = {}
     directory_dict = {}
@@ -190,10 +278,15 @@ def read_files(read_path, filter_types=None):
     return [path1 for paths in directory_dict.values() for path1 in paths if path1]
     return [path1 for paths in directory_dict.values() for path1 in paths if path1]
 
 
 
 
-def copy_to_new(from_path, to_path):
-    is_file = False
-    if to_path.count('.') > 0:
-        is_file = True
+def copy_to_new(from_path: str, to_path: str) -> None:
+    """
+    复制文件到新路径
+    
+    Args:
+        from_path: 源文件路径
+        to_path: 目标文件路径
+    """
+    is_file = '.' in to_path
 
 
     create_file_path(to_path, is_file_path=is_file)
     create_file_path(to_path, is_file_path=is_file)
 
 
@@ -201,11 +294,13 @@ def copy_to_new(from_path, to_path):
 
 
 
 
 # 创建路径
 # 创建路径
-def create_file_path(read_path, is_file_path=False):
+def create_file_path(read_path: str, is_file_path: bool = False) -> None:
     """
     """
     创建路径
     创建路径
-    :param read_path:创建文件夹的路径
-    :param is_file_path: 传入的path是否包含具体的文件名
+    
+    Args:
+        read_path: 创建文件夹的路径
+        is_file_path: 传入的path是否包含具体的文件名
     """
     """
     if is_file_path:
     if is_file_path:
         read_path = os.path.dirname(read_path)
         read_path = os.path.dirname(read_path)
@@ -214,9 +309,15 @@ def create_file_path(read_path, is_file_path=False):
         os.makedirs(read_path, exist_ok=True)
         os.makedirs(read_path, exist_ok=True)
 
 
 
 
-def valid_eval(eval_str):
+def valid_eval(eval_str: str) -> bool:
     """
     """
     验证 eval 是否包含非法的参数
     验证 eval 是否包含非法的参数
+    
+    Args:
+        eval_str: 要验证的表达式
+    
+    Returns:
+        是否合法
     """
     """
     safe_param = ["column", "wind_name", "df", "error_time", "str", "int"]
     safe_param = ["column", "wind_name", "df", "error_time", "str", "int"]
     eval_str_names = [node.id for node in ast.walk(ast.parse(eval_str)) if isinstance(node, ast.Name)]
     eval_str_names = [node.id for node in ast.walk(ast.parse(eval_str)) if isinstance(node, ast.Name)]

+ 0 - 202
utils/file/trans_methods.py_1

@@ -1,202 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Time    : 2024/5/16
-# @Author  : 魏志亮
-import datetime
-from os import *
-import shutil
-import warnings
-
-import chardet
-import pandas as pd
-
-from utils.log.trans_log import trans_print
-
-warnings.filterwarnings("ignore")
-
-
-# 获取文件编码
-def detect_file_encoding(filename):
-    # 读取文件的前1000个字节(足够用于大多数编码检测)
-    with open(filename, 'rb') as f:
-        rawdata = f.read(1000)
-    result = chardet.detect(rawdata)
-    encoding = result['encoding']
-
-    trans_print("文件类型:", filename, encoding)
-
-    if encoding is None:
-        encoding = 'gb18030'
-
-    if encoding.lower() in ['utf-8', 'ascii', 'utf8']:
-        return 'utf-8'
-
-    return 'gb18030'
-
-
-def del_blank(df=pd.DataFrame(), cols=list()):
-    for col in cols:
-        if df[col].dtype == object:
-            df[col] = df[col].str.strip()
-    return df
-
-
-# 切割数组到多个数组
-def split_array(array, num):
-    return [array[i:i + num] for i in range(0, len(array), num)]
-
-
-def find_read_header(file_path, trans_cols):
-    df = read_file_to_df(file_path, nrows=20)
-    count = 0
-    header = None
-    for col in trans_cols:
-        if col in df.columns:
-            count = count + 1
-            if count >= 2:
-                header = 0
-                break
-
-    count = 0
-
-    values = list()
-    for index, row in df.iterrows():
-        values = list(row.values)
-        if type(row.name) == tuple:
-            values.extend(list(row.name))
-        for col in trans_cols:
-            if col in values:
-                count = count + 1
-                if count > 2:
-                    header = index + 1
-                    break
-
-    read_cols = []
-    for col in values:
-        if col in trans_cols:
-            read_cols.append(col)
-
-    return header, read_cols
-
-
-# 读取数据到df
-def read_file_to_df(file_path, read_cols=list(), trans_cols=None, nrows=None):
-    begin = datetime.datetime.now()
-    trans_print('开始读取文件', file_path)
-    header = 0
-    find_cols = list()
-    if trans_cols:
-        header, find_cols = find_read_header(file_path, trans_cols)
-        trans_print(path.basename(file_path), "读取第", header, "行")
-        if header is None:
-            message = '未匹配到开始行,请检查并重新指定'
-            trans_print(message)
-            raise Exception(message)
-
-    read_cols.extend(find_cols)
-
-    try:
-        df = pd.DataFrame()
-        if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
-            encoding = detect_file_encoding(file_path)
-            end_with_gz = str(file_path).lower().endswith("gz")
-            if read_cols:
-                if end_with_gz:
-                    df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip', header=header,
-                                     nrows=nrows)
-                else:
-                    df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header,
-                                     on_bad_lines='warn', nrows=nrows)
-            else:
-
-                if end_with_gz:
-                    df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header, nrows=nrows)
-                else:
-                    df = pd.read_csv(file_path, encoding=encoding, header=header, on_bad_lines='warn', nrows=nrows)
-
-        else:
-            xls = pd.ExcelFile(file_path, engine="calamine")
-            # 获取所有的sheet名称
-            sheet_names = xls.sheet_names
-            for sheet_name in sheet_names:
-                if read_cols:
-                    now_df = pd.read_excel(xls, sheet_name=sheet_name, header=header, usecols=read_cols, nrows=nrows)
-                else:
-                    now_df = pd.read_excel(xls, sheet_name=sheet_name, header=header, nrows=nrows)
-
-                now_df['sheet_name'] = sheet_name
-                df = pd.concat([df, now_df])
-            xls.close()
-        trans_print('文件读取成功:', file_path, '数据数量:', df.shape, '耗时:', datetime.datetime.now() - begin)
-    except Exception as e:
-        trans_print('读取文件出错', file_path, str(e))
-        message = '文件:' + path.basename(file_path) + ',' + str(e)
-        raise ValueError(message)
-
-    return df
-
-
-def __build_directory_dict(directory_dict, path, filter_types=None):
-    # 遍历目录下的所有项
-    for item in listdir(path):
-        item_path = path.join(path, item)
-        if path.isdir(item_path):
-            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
-        elif path.isfile(item_path):
-            if path not in directory_dict:
-                directory_dict[path] = []
-
-            if filter_types is None or len(filter_types) == 0:
-                directory_dict[path].append(item_path)
-            elif str(item_path).split(".")[-1] in filter_types:
-                if str(item_path).count("~$") == 0:
-                    directory_dict[path].append(item_path)
-
-
-# 读取路径下所有的excel文件
-def read_excel_files(read_path):
-    if path.isfile(read_path):
-        return [read_path]
-
-    directory_dict = {}
-    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
-
-    return [path for paths in directory_dict.values() for path in paths if path]
-
-
-# 读取路径下所有的文件
-def read_files(read_path):
-    directory_dict = {}
-    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz', 'zip', 'rar'])
-
-    return [path for paths in directory_dict.values() for path in paths if path]
-
-
-def copy_to_new(from_path, to_path):
-    is_file = False
-    if to_path.count('.') > 0:
-        is_file = True
-
-    create_file_path(to_path, is_file_path=is_file)
-
-    shutil.copy(from_path, to_path)
-
-
-# 创建路径
-def create_file_path(path, is_file_path=False):
-    if is_file_path:
-        path = path.dirname(path)
-
-    if not path.exists(path):
-        makedirs(path, exist_ok=True)
-
-
-if __name__ == '__main__':
-    datas = read_excel_files(r"D:\data\清理数据\招远风电场\WOF053600062-WOB000009_ZYFDC000012\minute")
-    for data in datas:
-        print(data)
-
-    print("*" * 20)
-
-    datas = read_excel_files(r"D:\data\清理数据\招远风电场\WOF053600062-WOB000009_ZYFDC000012\minute\WOG00066.csv.gz")
-    for data in datas:
-        print(data)

+ 99 - 25
utils/log/trans_log.py

@@ -7,6 +7,7 @@ import logging
 import sys
 import sys
 from os import *
 from os import *
 
 
+from conf.constants import Log
 from utils.conf.read_conf import read_conf, yaml_conf
 from utils.conf.read_conf import read_conf, yaml_conf
 
 
 
 
@@ -26,34 +27,107 @@ class ContextFilter(logging.Filter):
         return True
         return True
 
 
 
 
-logger = logging.getLogger("etl_tools")
-logger.setLevel(logging.INFO)
-stout_handle = logging.StreamHandler(sys.stdout)
-stout_handle.setFormatter(
-    logging.Formatter("%(asctime)s-%(trace_id)s: %(message)s"))
-stout_handle.setLevel(logging.INFO)
-stout_handle.addFilter(ContextFilter())
-logger.addHandler(stout_handle)
+# 初始化日志配置
+def init_logger():
+    """初始化日志配置"""
+    logger = logging.getLogger("etl_tools")
+    logger.setLevel(logging.DEBUG)  # 设置为DEBUG以捕获所有级别的日志
 
 
-config_path = path.abspath(__file__).split("utils")[0] + 'conf' + sep + 'etl_config_dev.yaml'
-config_path = environ.get('ETL_CONF', config_path)
-config = yaml_conf(environ.get('ETL_CONF', config_path))
-log_path_dir = read_conf(config, 'log_path_dir', "/data/logs")
+    # 清除已有的处理器
+    if logger.handlers:
+        logger.handlers.clear()
 
 
-log_path = log_path_dir + sep + r'etl_tools_' + (environ['env'] if 'env' in environ else 'dev')
-file_path = path.join(log_path)
+    formatter = logging.Formatter("%(asctime)s-%(levelname)s-%(trace_id)s: %(message)s")
 
 
-if not path.exists(file_path):
-    makedirs(file_path, exist_ok=True)
-file_name = file_path + sep + str(datetime.date.today()) + '.log'
+    # 控制台处理器
+    stout_handle = logging.StreamHandler(sys.stdout)
+    stout_handle.setFormatter(formatter)
 
 
-file_handler = logging.FileHandler(file_name, encoding='utf-8')
-file_handler.setFormatter(
-    logging.Formatter("%(asctime)s-%(trace_id)s: %(message)s"))
-file_handler.setLevel(logging.INFO)
-file_handler.addFilter(ContextFilter())
-logger.addHandler(file_handler)
+    # 根据环境设置日志级别
+    env = environ.get('env', 'dev')
 
 
+    stout_handle.setLevel(logging.INFO)
 
 
-def trans_print(*args):
-    logger.info("  ".join([str(a) for a in args]))
+    stout_handle.addFilter(ContextFilter())
+    logger.addHandler(stout_handle)
+
+    # 文件处理器
+    try:
+        config_path = environ.get('ETL_CONF')
+        if config_path:
+            config = yaml_conf(config_path)
+            log_path_dir = read_conf(config, 'log_path_dir', Log.DEFAULT_LOG_PATH)
+        else:
+            log_path_dir = Log.DEFAULT_LOG_PATH
+
+        log_path = log_path_dir + sep + Log.LOG_FILE_PREFIX + (environ['env'] if 'env' in environ else 'dev')
+        file_path = path.join(log_path)
+
+        if not path.exists(file_path):
+            makedirs(file_path, exist_ok=True)
+        # 普通日志文件(INFO及以上)
+        file_name = file_path + sep + str(datetime.date.today()) + '.log'
+        file_handler = logging.FileHandler(file_name, encoding='utf-8')
+        file_handler.setFormatter(formatter)
+        file_handler.setLevel(logging.INFO)
+        file_handler.addFilter(ContextFilter())
+        logger.addHandler(file_handler)
+
+        # 错误日志文件(ERROR及以上)
+        error_file_name = file_path + sep + str(datetime.date.today()) + '.error.log'
+        error_file_handler = logging.FileHandler(error_file_name, encoding='utf-8')
+        error_file_handler.setFormatter(formatter)
+        error_file_handler.setLevel(logging.ERROR)
+        error_file_handler.addFilter(ContextFilter())
+        logger.addHandler(error_file_handler)
+    except Exception as e:
+        # 如果日志文件创建失败,只使用控制台日志
+        pass
+
+    return logger
+
+
+# 初始化日志记录器
+logger = init_logger()
+
+
+def trans_print(*args, level: str = 'info'):
+    """
+    打印日志
+    
+    Args:
+        *args: 日志内容
+        level: 日志级别,可选值: 'debug', 'info', 'warning', 'error'
+    """
+    message = "  ".join([str(a) for a in args])
+
+    if level == 'debug':
+        logger.debug(message)
+    elif level == 'info':
+        logger.info(message)
+    elif level == 'warning':
+        logger.warning(message)
+    elif level == 'error':
+        logger.error(message)
+    else:
+        logger.info(message)
+
+
+def debug(*args):
+    """打印调试日志"""
+    trans_print(*args, level='debug')
+
+
+def info(*args):
+    """打印信息日志"""
+    trans_print(*args, level='info')
+
+
+def warning(*args):
+    """打印警告日志"""
+    trans_print(*args, level='warning')
+
+
+def error(*args):
+    """打印错误日志"""
+    trans_print(*args, level='error')

+ 113 - 26
utils/systeminfo/sysinfo.py

@@ -1,13 +1,21 @@
-from os import *
+import os
+from typing import List
 
 
 import psutil
 import psutil
 
 
-from utils.log.trans_log import trans_print
+from conf.constants import ParallelProcessing
+from utils.log.trans_log import info, debug
 
 
 
 
-def print_memory_usage(detail=""):
+def print_memory_usage(detail: str = "") -> None:
+    """
+    打印内存使用情况
+    
+    Args:
+        detail: 详细信息
+    """
     # 获取当前进程ID
     # 获取当前进程ID
-    pid = getpid()
+    pid = os.getpid()
     # 获取进程信息
     # 获取进程信息
     py = psutil.Process(pid)
     py = psutil.Process(pid)
     # 获取内存信息
     # 获取内存信息
@@ -21,34 +29,85 @@ def print_memory_usage(detail=""):
     memory_usage_rss_mb = memory_usage_rss / (1024 ** 2)
     memory_usage_rss_mb = memory_usage_rss / (1024 ** 2)
     memory_usage_vms_mb = memory_usage_vms / (1024 ** 2)
     memory_usage_vms_mb = memory_usage_vms / (1024 ** 2)
 
 
-    trans_print(f"{detail},Memory usage (RSS): {memory_usage_rss_mb:.2f} MB")
-    trans_print(f"{detail},Memory usage (VMS): {memory_usage_vms_mb:.2f} MB")
+    debug(f"{detail},Memory usage (RSS): {memory_usage_rss_mb:.2f} MB")
+    debug(f"{detail},Memory usage (VMS): {memory_usage_vms_mb:.2f} MB")
 
 
 
 
-def get_cpu_count():
+def get_cpu_count() -> int:
+    """
+    获取CPU核心数
+    
+    Returns:
+        CPU核心数
+    """
     return psutil.cpu_count()
     return psutil.cpu_count()
 
 
 
 
-def get_available_cpu_count_with_percent(percent: float = 1):
+def get_available_cpu_count_with_percent(percent: float = 1) -> int:
+    """
+    根据百分比获取可用CPU数
+    
+    Args:
+        percent: CPU使用百分比
+    
+    Returns:
+        可用CPU数
+    """
     cpu_count = get_cpu_count()
     cpu_count = get_cpu_count()
     return int(cpu_count * percent)
     return int(cpu_count * percent)
 
 
 
 
-def get_file_size(file_path):
-    return path.getsize(file_path)
-
-
-def get_dir_size(dir_path):
-    return sum(get_file_size(path.join(dir_path, file)) for file in listdir(dir_path) if
-               path.isfile(path.join(dir_path, file)))
-
-
-def get_available_memory_with_percent(percent: float = 1):
+def get_file_size(file_path: str) -> int:
+    """
+    获取文件大小
+    
+    Args:
+        file_path: 文件路径
+    
+    Returns:
+        文件大小(字节)
+    """
+    return os.path.getsize(file_path)
+
+
+def get_dir_size(dir_path: str) -> int:
+    """
+    获取目录大小
+    
+    Args:
+        dir_path: 目录路径
+    
+    Returns:
+        目录大小(字节)
+    """
+    return sum(get_file_size(os.path.join(dir_path, file)) for file in os.listdir(dir_path) if
+               os.path.isfile(os.path.join(dir_path, file)))
+
+
+def get_available_memory_with_percent(percent: float = 1) -> int:
+    """
+    根据百分比获取可用内存
+    
+    Args:
+        percent: 内存使用百分比
+    
+    Returns:
+        可用内存(字节)
+    """
     memory_info = psutil.virtual_memory()
     memory_info = psutil.virtual_memory()
     return int(memory_info.available * percent)
     return int(memory_info.available * percent)
 
 
 
 
-def get_max_file_size(file_paths: list[str]):
+def get_max_file_size(file_paths: List[str]) -> int:
+    """
+    获取文件列表中的最大文件大小
+    
+    Args:
+        file_paths: 文件路径列表
+    
+    Returns:
+        最大文件大小(字节)
+    """
     max_size = 0
     max_size = 0
     for file_path in file_paths:
     for file_path in file_paths:
         file_size = get_file_size(file_path)
         file_size = get_file_size(file_path)
@@ -57,11 +116,25 @@ def get_max_file_size(file_paths: list[str]):
     return max_size
     return max_size
 
 
 
 
-def use_files_get_max_cpu_count(file_paths: list[str], memory_percent: float = 1 / 12, cpu_percent: float = 2 / 5):
+def use_files_get_max_cpu_count(file_paths: List[str], memory_percent: float = 1 / 12,
+                                cpu_percent: float = 2 / 5) -> int:
+    """
+    根据文件大小和内存情况计算最大进程数
+    
+    Args:
+        file_paths: 文件路径列表
+        memory_percent: 内存使用百分比
+        cpu_percent: CPU使用百分比
+    
+    Returns:
+        最大进程数
+    """
     max_file_size = get_max_file_size(file_paths)
     max_file_size = get_max_file_size(file_paths)
     free_memory = get_available_memory_with_percent(memory_percent)
     free_memory = get_available_memory_with_percent(memory_percent)
     count = int(free_memory / max_file_size)
     count = int(free_memory / max_file_size)
     max_cpu_count = get_available_cpu_count_with_percent(cpu_percent)
     max_cpu_count = get_available_cpu_count_with_percent(cpu_percent)
+    # 限制最大进程数
+    max_cpu_count = min(max_cpu_count, ParallelProcessing.MAX_PROCESSES)
     result = count if count <= max_cpu_count else max_cpu_count
     result = count if count <= max_cpu_count else max_cpu_count
     if result == 0:
     if result == 0:
         result = 1
         result = 1
@@ -69,21 +142,35 @@ def use_files_get_max_cpu_count(file_paths: list[str], memory_percent: float = 1
     if result > len(file_paths):
     if result > len(file_paths):
         result = len(file_paths)
         result = len(file_paths)
 
 
-    trans_print("总文件数:", len(file_paths), ",获取最大文件大小:", str(round(max_file_size / 2 ** 20, 2)) + "M",
-                "可用内存:", str(get_available_memory_with_percent(1) / 2 ** 20) + "M",
-                "总CPU数:", get_cpu_count(), "CPU使用比例:", round(cpu_percent, 2), "CPU可用数量:", max_cpu_count,
-                ",最终确定使用进程数:", result)
+    info("总文件数:", len(file_paths), ",获取最大文件大小:", str(round(max_file_size / 2 ** 20, 2)) + "M",
+         "可用内存:", str(get_available_memory_with_percent(1) / 2 ** 20) + "M",
+         "总CPU数:", get_cpu_count(), "CPU使用比例:", round(cpu_percent, 2), "CPU可用数量:", max_cpu_count,
+         ",最终确定使用进程数:", result)
     return result
     return result
 
 
 
 
-def max_file_size_get_max_cpu_count(max_file_size, memory_percent: float = 1 / 6, cpu_percent: float = 2 / 5):
+def max_file_size_get_max_cpu_count(max_file_size: int, memory_percent: float = 1 / 6,
+                                    cpu_percent: float = 2 / 5) -> int:
+    """
+    根据最大文件大小和内存情况计算最大进程数
+    
+    Args:
+        max_file_size: 最大文件大小
+        memory_percent: 内存使用百分比
+        cpu_percent: CPU使用百分比
+    
+    Returns:
+        最大进程数
+    """
     free_memory = get_available_memory_with_percent(memory_percent)
     free_memory = get_available_memory_with_percent(memory_percent)
     count = int(free_memory / max_file_size)
     count = int(free_memory / max_file_size)
     max_cpu_count = get_available_cpu_count_with_percent(cpu_percent)
     max_cpu_count = get_available_cpu_count_with_percent(cpu_percent)
+    # 限制最大进程数
+    max_cpu_count = min(max_cpu_count, ParallelProcessing.MAX_PROCESSES)
     result = count if count <= max_cpu_count else max_cpu_count
     result = count if count <= max_cpu_count else max_cpu_count
     if result == 0:
     if result == 0:
         result = 1
         result = 1
-    trans_print(",获取最大文件大小:", str(round(max_file_size / 2 ** 20, 2)) + "M",
+    info(",获取最大文件大小:", str(round(max_file_size / 2 ** 20, 2)) + "M",
                 "可用内存:", str(get_available_memory_with_percent(1) / 2 ** 20) + "M",
                 "可用内存:", str(get_available_memory_with_percent(1) / 2 ** 20) + "M",
                 "总CPU数:", get_cpu_count(), "CPU使用比例:", round(cpu_percent, 2), "CPU可用数量:", max_cpu_count,
                 "总CPU数:", get_cpu_count(), "CPU使用比例:", round(cpu_percent, 2), "CPU可用数量:", max_cpu_count,
                 ",最终确定使用进程数:", result)
                 ",最终确定使用进程数:", result)

+ 0 - 0
utils/tmp_util/__init__.py


+ 0 - 37
utils/tmp_util/合并文件.py

@@ -1,37 +0,0 @@
-import multiprocessing
-
-read_dir = r'/data/download/collection_data/1进行中/张崾先风电场-陕西-华电/收资数据/整改复核数据/2025年06月19日16时17分41秒'
-
-import os
-import pandas as pd
-
-# 获取文件夹下所有文件的路径
-file_paths = [os.path.join(read_dir, file) for file in os.listdir(read_dir) if
-              os.path.isfile(os.path.join(read_dir, file))]
-
-
-def read_and_save(wind_no, files, save_dir):
-    # 读取文件
-    df = pd.concat([pd.read_csv(file) for file in files])
-
-    # 保存文件
-    df.to_csv(os.path.join(save_dir, f'{wind_no}.csv'), index=False, encoding='utf-8')
-
-
-if __name__ == '__main__':
-
-    wind_dicts = dict()
-
-    save_dir = r'/data/download/collection_data/1进行中/张崾先风电场-陕西-华电/收资数据/整改复核数据/合并202506191654'
-
-    os.makedirs(save_dir, exist_ok=True)
-
-    for file in os.listdir(read_dir):
-        wind_no = file.split('(')[0]
-        if wind_no not in wind_dicts:
-            wind_dicts[wind_no] = [os.path.join(read_dir, file)]
-        else:
-            wind_dicts[wind_no].append(os.path.join(read_dir, file))
-
-    with multiprocessing.Pool(20) as pool:
-        pool.starmap(read_and_save, [(key, files, save_dir) for key, files in wind_dicts.items()])

+ 0 - 100
utils/tmp_util/整理INSERT到批量INSERT.py

@@ -1,100 +0,0 @@
-# coding=utf-8
-
-
-import re
-from collections import defaultdict
-
-import pymysql
-
-
-def read_sql_inserts(file_path):
-    """生成器函数,逐行读取INSERT语句"""
-    with open(file_path, 'r', encoding='utf-8') as f:
-        for line in f:
-            line = line.strip()
-            if line.startswith('INSERT INTO'):
-                yield line
-
-
-def process_large_sql_file(input_file, batch_size=10000):
-    table_data = defaultdict(lambda: {
-        'columns': None,
-        'value_rows': []
-    })
-
-    insert_pattern = re.compile(
-        r'INSERT\s+INTO\s+`?([a-zA-Z_][a-zA-Z0-9_]*)`?\s*\((.*?)\)\s*VALUES\s*\((.*?)\);',
-        re.IGNORECASE
-    )
-
-    # 使用生成器处理
-    for insert_stmt in read_sql_inserts(input_file):
-        match = insert_pattern.match(insert_stmt)
-        if match:
-            table_name = match.group(1)
-            columns = match.group(2)
-            values = match.group(3)
-
-            if table_data[table_name]['columns'] is None:
-                table_data[table_name]['columns'] = columns
-
-            table_data[table_name]['value_rows'].append(values)
-
-    # 生成批量INSERT语句
-    batch_inserts = {}
-    for table_name, data in table_data.items():
-        columns = data['columns']
-        value_rows = data['value_rows']
-
-        for i in range(0, len(value_rows), batch_size):
-            batch_values = value_rows[i:i + batch_size]
-            batch_insert = f"INSERT INTO `{table_name}` ({columns}) VALUES\n"
-            batch_insert += ",\n".join([f"({values})" for values in batch_values])
-            batch_insert += ";"
-
-            if table_name not in batch_inserts:
-                batch_inserts[table_name] = []
-            batch_inserts[table_name].append(batch_insert)
-
-    return batch_inserts
-
-
-def execute_batch_inserts(db_config, batch_inserts):
-    """直接执行批量INSERT到数据库"""
-    connection = pymysql.connect(**db_config)
-    try:
-        with connection.cursor() as cursor:
-            for table_name, inserts in batch_inserts.items():
-                for index, insert_sql in enumerate(inserts):
-                    cursor.execute(insert_sql)
-                    print(f"表 {table_name},共 {len(inserts)} 个, 第 {index + 1} 个批量INSERT语句执行成功")
-        connection.commit()
-    finally:
-        connection.close()
-
-
-# 数据库配置
-db_config = {
-    'host': '192.168.50.235',
-    'user': 'root',
-    'password': 'admin123456',
-    'db': 'wtlivedb_1',
-    'charset': 'utf8mb4'
-}
-
-"""
-移除INSERT 语句 其他的就是建表语句了
-cat file |grep -v 'INSERT ' > create_talbe.sql
-下面是 INSERT 转化为  BATCH INSERT 的脚本
-"""
-
-if __name__ == "__main__":
-    input_file = "wtlivedb.sql"
-
-    # 使用
-    batch_inserts = process_large_sql_file("input.sql")
-    execute_batch_inserts(db_config, batch_inserts)
-
-    # 打印统计信息
-    for table_name, inserts in batch_inserts.items():
-        print(f"表 '{table_name}': {len(inserts)} 个批量INSERT语句")

+ 0 - 87
utils/tmp_util/神木_完整度_10分.py

@@ -1,87 +0,0 @@
-# coding=utf-8
-
-import datetime
-import multiprocessing
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath(__file__).split("utils")[0])
-
-import pandas as pd
-
-from utils.file.trans_methods import read_file_to_df, read_excel_files
-
-
-def get_time_space_count(start_time: datetime.datetime, end_time: datetime.datetime, time_space=1):
-    """
-    获取俩个时间之间的个数
-    :return: 查询时间间隔
-    """
-    delta = end_time - start_time
-    total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
-
-    return abs(int(total_seconds / time_space)) + 1
-
-
-def save_percent(value, save_decimal=7):
-    return round(value, save_decimal) * 100
-
-
-def read_and_select(file_path, read_cols_bak):
-    try:
-        read_cols = read_cols_bak[0:len(read_cols_bak)]
-        result_df = pd.DataFrame()
-        df = read_file_to_df(file_path, read_cols=read_cols)
-        wind_name = df['名称'].values[0]
-        df['时间'] = pd.to_datetime(df['时间'])
-        count = get_time_space_count(df['时间'].min(), df['时间'].max(), 600)
-        repeat_time_count = df.shape[0] - len(df['时间'].unique())
-        print(wind_name, count, repeat_time_count)
-        result_df['风机号'] = [wind_name]
-        result_df['重复率'] = [save_percent(repeat_time_count / count)]
-        result_df['重复次数'] = [repeat_time_count]
-        result_df['总记录数'] = [count]
-
-        read_cols.remove('名称')
-        for read_col in read_cols:
-
-            if read_col != '时间':
-                df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
-            else:
-                df[read_col] = pd.to_datetime(df[read_col], errors='coerce')
-
-        group_df = df.groupby(by=['名称']).count()
-        group_df.reset_index(inplace=True)
-        count_df = pd.DataFrame(group_df)
-        total_count = count_df[read_cols].values[0].sum()
-        print(wind_name, total_count, count * len(read_cols))
-        result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / (count * len(read_cols)))]
-        result_df['缺失数值'] = [
-            '-'.join([f'{col_name}_{str(count - i)}' for col_name, i in zip(read_cols, count_df[read_cols].values[0])])]
-        del group_df
-
-        error_fengsu_count = df.query("(风速 < 0) | (风速 > 80)").shape[0]
-        error_yougong_gonglv = df.query("(发电机有功功率 < -200) | (发电机有功功率 > 2500)").shape[0]
-
-        result_df['平均异常率'] = [save_percent((error_fengsu_count + error_yougong_gonglv) / (2 * count))]
-    except Exception as e:
-        print(file_path)
-        raise e
-
-    return result_df
-
-
-if __name__ == '__main__':
-    read_cols_str = '名称,时间,发电机有功功率,发电机转速,发电机驱动端轴承温度,发电机非驱动端轴承温度,发电机定子U相线圈温度,发电机定子V相线圈温度,发电机定子W相线圈温度,实际扭矩,设定扭矩,仪表盘风速,舱内温度,控制柜内温度,舱外温度,风向,风速,机舱风向夹角,1#桨叶片角度,1#桨设定角度,2#桨叶片角度,2#桨设定角度,3#桨叶片角度,3#桨设定角度,1#桨电机温度,2#桨电机温度,3#桨电机温度,轮毂内温度,齿轮箱油泵吸油口油压,齿轮箱分配器位置油压,偏航液压刹车系统蓄能罐压力,主轴转速,齿轮箱油路入口温度,齿轮箱中间轴驱动端轴承温度,齿轮箱中间轴非驱动端轴承温度,齿轮箱油池温度,主轴承外圈温度,可利用率,机舱位置,总扭缆角度'
-    read_cols = [i for i in read_cols_str.split(",") if i]
-    read_dir = r'D:\data\tmp_data\10分'
-
-    files = read_excel_files(read_dir)
-
-    with multiprocessing.Pool(4) as pool:
-        dfs = pool.starmap(read_and_select, [(os.path.join(read_dir, i), read_cols) for i in files])
-
-    df = pd.concat(dfs, ignore_index=True)
-    df.sort_values(by=['风机号'], inplace=True)
-
-    df.to_csv("神木风电场-10分钟.csv", encoding='utf8', index=False)

+ 0 - 90
utils/tmp_util/神木_完整度_1分.py

@@ -1,90 +0,0 @@
-# coding=utf-8
-
-import datetime
-import multiprocessing
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath(__file__).split("utils")[0])
-
-import pandas as pd
-
-from utils.file.trans_methods import read_file_to_df, read_excel_files
-
-
-def get_time_space_count(start_time: datetime.datetime, end_time: datetime.datetime, time_space=1):
-    """
-    获取俩个时间之间的个数
-    :return: 查询时间间隔
-    """
-    delta = end_time - start_time
-    total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
-
-    return abs(int(total_seconds / time_space)) + 1
-
-
-def save_percent(value, save_decimal=7):
-    return round(value, save_decimal) * 100
-
-
-def read_and_select(file_path):
-    try:
-        result_df = pd.DataFrame()
-        df = read_file_to_df(file_path)
-        read_cols_bak = df.columns.tolist()
-
-        wind_name = df['名称'].values[0]
-        df['时间'] = pd.to_datetime(df['时间'])
-        count = get_time_space_count(df['时间'].min(), df['时间'].max(), 60)
-        repeat_time_count = df.shape[0] - len(df['时间'].unique())
-        print(wind_name, count, repeat_time_count)
-        result_df['风机号'] = [wind_name]
-        result_df['重复率'] = [save_percent(repeat_time_count / count)]
-        result_df['重复次数'] = [repeat_time_count]
-        result_df['总记录数'] = [count]
-
-        read_cols_bak.remove('名称')
-        read_cols = list()
-        for read_col in read_cols_bak:
-
-            if read_col == '时间':
-                df[read_col] = pd.to_datetime(df[read_col], errors='coerce')
-                read_cols.append(read_col)
-            else:
-                df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
-                if not df[read_col].isnull().all():
-                    read_cols.append(read_col)
-
-        group_df = df.groupby(by=['名称']).count()
-        group_df.reset_index(inplace=True)
-        count_df = pd.DataFrame(group_df)
-        total_count = count_df[read_cols].values[0].sum()
-        print(wind_name, total_count, count * len(read_cols))
-        result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / (count * len(read_cols)))]
-        result_df['缺失数值'] = [
-            '-'.join([f'{col_name}_{str(count - i)}' for col_name, i in zip(read_cols, count_df[read_cols].values[0])])]
-        del group_df
-
-        error_fengsu_count = df.query("(风速 < 0) | (风速 > 80)").shape[0]
-        error_yougong_gonglv = df.query("(发电机有功功率 < -200) | (发电机有功功率 > 2500)").shape[0]
-
-        result_df['平均异常率'] = [save_percent((error_fengsu_count + error_yougong_gonglv) / (2 * count))]
-    except Exception as e:
-        print(file_path)
-        raise e
-
-    return result_df
-
-
-if __name__ == '__main__':
-    read_dir = r'D:\data\tmp_data\1分\远景1min'
-
-    files = read_excel_files(read_dir)
-
-    with multiprocessing.Pool(4) as pool:
-        dfs = pool.map(read_and_select, files)
-
-    df = pd.concat(dfs, ignore_index=True)
-    df.sort_values(by=['风机号'], inplace=True)
-
-    df.to_csv("神木风电场-1分钟.csv", encoding='utf8', index=False)

+ 0 - 18
utils/tmp_util/获取台账所有wind表信息.py

@@ -1,18 +0,0 @@
-import sys
-from os import path, environ
-
-env = 'dev'
-if len(sys.argv) >= 2:
-    env = sys.argv[1]
-
-conf_path = path.abspath(__file__).split("energy-data-trans")[0] + f"/energy-data-trans/conf/etl_config_{env}.yaml"
-environ['ETL_CONF'] = conf_path
-environ['env'] = env
-
-from service.common_connect import plt
-
-tables = 'wind_company,wind_engine_group,wind_engine_mill,wind_exception_count,wind_field,wind_field_batch,wind_field_contract,wind_field_resource,wind_relation'
-
-for table in tables.split(','):
-    df = plt.read_sql_to_df(f"select * from {table}")
-    df.to_csv(table + '.csv', encoding='utf8', index=False)

+ 0 - 76
utils/tmp_util/表添加列.py

@@ -1,76 +0,0 @@
-import os
-import sys
-
-env = 'prod'
-if len(sys.argv) >= 2:
-    env = sys.argv[1]
-
-conf_path = os.path.abspath(__file__).split("energy-data-trans")[0] + f"/energy-data-trans/conf/etl_config_{env}.yaml"
-os.environ['ETL_CONF'] = conf_path
-os.environ['env'] = env
-
-db_last = ''
-if env != 'dev':
-    db_last = db_last + '_' + env
-
-query_sql = f"""
-SELECT
-	t.TABLE_NAME
-FROM
-	information_schema.`TABLES` t
-WHERE
-	t.TABLE_SCHEMA = 'energy_data{db_last}'
-AND t.TABLE_NAME LIKE 'WOF%%_minute'
-AND t.TABLE_NAME NOT IN (
-	SELECT
-		table_name
-	FROM
-		information_schema.`COLUMNS` a
-	WHERE
-		a.TABLE_SCHEMA = 'energy_data{db_last}'
-	AND a.TABLE_NAME LIKE 'WOF%%_minute'
-	AND a.COLUMN_NAME = 'main_bearing_temperature_2'
-)
-"""
-
-
-def get_table_count(table_name):
-    query_begin = time.time()
-    query_sql = f"""
-    select count(1) as count from {table_name}
-    """
-    print(table_name, '统计条数耗时', time.time() - query_begin, trans.execute(query_sql)[0]['count'])
-
-
-def get_update_sql(table_name):
-    update_sql = f"""
-        ALTER TABLE {table_name}
-        ADD COLUMN `main_bearing_temperature_2` double DEFAULT NULL COMMENT '主轴承轴承温度2', 
-        ADD COLUMN `grid_a_phase_current` double DEFAULT NULL COMMENT '电网A相电流',
-        ADD COLUMN `grid_b_phase_current` double DEFAULT NULL COMMENT '电网B相电流',
-        ADD COLUMN `grid_c_phase_current` double DEFAULT NULL COMMENT '电网C相电流',
-        ADD COLUMN `reactive_power` double DEFAULT NULL COMMENT '无功功率';
-        """
-    return update_sql
-
-
-if __name__ == '__main__':
-    from service.common_connect import trans
-
-    # tables = trans.execute(query_sql)
-    # print(tables)
-
-    tables = list()
-    tables.append({'TABLE_NAME': 'WOF093400005_minute'})
-
-    import time
-
-    begin_time = time.time()
-    for table in tables:
-        table_name = '`' + table['TABLE_NAME'] + '`'
-        get_table_count(table_name)
-        update_time = time.time()
-        trans.execute(get_update_sql(table_name))
-        print(table_name, '更新耗时', time.time() - update_time)
-
-    print(len(tables), '张表,总耗时:', time.time() - begin_time)

+ 0 - 49
utils/tmp_util/表添加注释.py

@@ -1,49 +0,0 @@
-import os
-import sys
-
-env = 'tidbprod'
-if len(sys.argv) >= 2:
-    env = sys.argv[1]
-
-conf_path = os.path.abspath(__file__).split("energy-data-trans")[0] + f"/energy-data-trans/conf/etl_config_{env}.yaml"
-os.environ['ETL_CONF'] = conf_path
-os.environ['env'] = env
-
-from service.common_connect import trans, plt
-
-
-def get_all_tables():
-    query_sql = f"""
-    
-    SELECT 
-        t.TABLE_NAME
-    FROM
-        information_schema.`TABLES` t
-    WHERE
-        t.TABLE_SCHEMA = 'energy_data_prod'
-"""
-
-    return trans.execute(query_sql)
-
-
-def get_all_wind_company():
-    query_sql = "SELECT t.field_code,t.field_name FROM wind_field t where t.del_state = 0"
-    datas = plt.execute(query_sql)
-    result_dict = dict()
-    for data in datas:
-        result_dict[data['field_code']] = data['field_name']
-
-    return result_dict
-
-
-if __name__ == '__main__':
-    code_name_dict = get_all_wind_company()
-    tables = get_all_tables()
-    for table in tables:
-        table_name = table['TABLE_NAME']
-
-        if table_name.startswith('WOF'):
-            field_code = table_name.split('_')[0].split('-')[0]
-            if field_code in code_name_dict.keys():
-                update_sql = f"ALTER TABLE `{table_name}` COMMENT = '{code_name_dict[field_code]}'"
-                trans.execute(update_sql)

+ 0 - 27
utils/tmp_util/颗粒度变大.py

@@ -1,27 +0,0 @@
-import os
-
-import pandas as pd
-
-
-def trans_time_granularity(read_dir: str, save_dir: str, time_str: str, time_granularity: str, group_by: list):
-    for root, dirs, files in os.walk(read_dir):
-        for file in files:
-            file_path = os.path.join(root, file)
-            df = pd.read_csv(file_path)
-            # df = df.drop(index=0)
-            df[time_str] = pd.to_datetime(df[time_str], errors='coerce')
-            df[time_str] = df[time_str].dt.ceil(time_granularity)
-            groupby_df = df.groupby(group_by).mean(numeric_only=True).reset_index()
-
-            save_file = file_path.replace(read_dir, save_dir)
-            if not os.path.exists(os.path.dirname(save_file)):
-                os.makedirs(os.path.dirname(save_file))
-
-            groupby_df.to_csv(save_file, index=False, encoding='utf-8')
-
-
-if __name__ == '__main__':
-    read_dir = r'D:\data\tmp_data\龙源\minute'
-    save_dir = r'D:\data\tmp_data\龙源\minute12'
-
-    trans_time_granularity(read_dir, save_dir, 'time_stamp', '20min', ['time_stamp'])

+ 98 - 53
utils/zip/unzip.py

@@ -1,17 +1,27 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 # @Time    : 2024/5/17
 # @Time    : 2024/5/17
 # @Author  : 魏志亮
 # @Author  : 魏志亮
+import os
 import traceback
 import traceback
 import zipfile
 import zipfile
-from os import *
+from typing import Tuple, Optional
 
 
 import rarfile
 import rarfile
 
 
-from utils.file.trans_methods import detect_file_encoding
-from utils.log.trans_log import trans_print, logger
+from utils.file.trans_methods import detect_file_encoding, create_file_path
+from utils.log.trans_log import debug, error
 
 
 
 
-def __support_gbk(zip_file: zipfile.ZipFile):
+def __support_gbk(zip_file: zipfile.ZipFile) -> zipfile.ZipFile:
+    """
+    支持GBK编码的zip文件
+    
+    Args:
+        zip_file: ZipFile对象
+    
+    Returns:
+        处理后的ZipFile对象
+    """
     name_to_info = zip_file.NameToInfo
     name_to_info = zip_file.NameToInfo
     # copy map first
     # copy map first
     for name, info in name_to_info.copy().items():
     for name, info in name_to_info.copy().items():
@@ -23,18 +33,31 @@ def __support_gbk(zip_file: zipfile.ZipFile):
     return zip_file
     return zip_file
 
 
 
 
-def unzip(zip_filepath, dest_path):
+def unzip(zip_filepath: str, dest_path: str) -> Tuple[bool, Optional[Exception]]:
+    """
+    解压zip文件
+    
+    Args:
+        zip_filepath: zip文件路径
+        dest_path: 解压目标路径
+    
+    Returns:
+        (是否成功, 错误信息)
+    """
     # 解压zip文件
     # 解压zip文件
     is_success = True
     is_success = True
-    trans_print('开始读取文件:', zip_filepath)
-    trans_print("解压到:", dest_path)
+    debug('开始读取文件:', zip_filepath)
+    debug("解压到:", dest_path)
+
+    # 确保目标路径存在
+    create_file_path(dest_path)
 
 
     try:
     try:
         if detect_file_encoding(zip_filepath).startswith("gb"):
         if detect_file_encoding(zip_filepath).startswith("gb"):
             try:
             try:
-                with __support_gbk(zipfile.ZipFile(zip_filepath, 'r'))  as zip_ref:
+                with __support_gbk(zipfile.ZipFile(zip_filepath, 'r')) as zip_ref:
                     zip_ref.extractall(dest_path)
                     zip_ref.extractall(dest_path)
-            except:
+            except Exception:
                 with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
                 with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
                     zip_ref.extractall(dest_path)
                     zip_ref.extractall(dest_path)
         else:
         else:
@@ -42,46 +65,60 @@ def unzip(zip_filepath, dest_path):
                 zip_ref.extractall(dest_path)
                 zip_ref.extractall(dest_path)
 
 
     except zipfile.BadZipFile as e:
     except zipfile.BadZipFile as e:
-        trans_print(traceback.format_exc())
+        error(traceback.format_exc())
+        is_success = False
+        error('不是zip文件:', zip_filepath)
+        return is_success, e
+    except Exception as e:
+        error(traceback.format_exc())
         is_success = False
         is_success = False
-        trans_print('不是zip文件:', zip_filepath)
+        error('解压文件出错:', zip_filepath, str(e))
         return is_success, e
         return is_success, e
 
 
     # 遍历解压后的文件
     # 遍历解压后的文件
-    dest_path = dest_path
-    trans_print('解压再次读取', dest_path)
+    debug('解压再次读取', dest_path)
     if is_success:
     if is_success:
-        for root, dirs, files in walk(dest_path):
+        for root, dirs, files in os.walk(dest_path):
             for file in files:
             for file in files:
-                file_path = path.join(root, file)
+                file_path = os.path.join(root, file)
                 # 检查文件是否是zip文件
                 # 检查文件是否是zip文件
                 if file_path.endswith('.zip'):
                 if file_path.endswith('.zip'):
                     if file_path.endswith('.csv.zip'):
                     if file_path.endswith('.csv.zip'):
-                        rename(file_path, file_path.replace(".csv.zip", ".csv.gz"))
+                        os.rename(file_path, file_path.replace(".csv.zip", ".csv.gz"))
                     else:
                     else:
                         # 如果是,递归解压
                         # 如果是,递归解压
-                        unzip(file_path, dest_path + sep + get_desc_path(str(file)))
-                        # 删除已解压的zip文件(可选)
-                        remove(file_path)
-                    # 检查文件是否是zip文件
-                if file_path.endswith('.rar'):
+                        unzip(file_path, os.path.join(dest_path, get_desc_path(str(file))))
+                        # 删除已解压的zip文件
+                        os.remove(file_path)
+                # 检查文件是否是rar文件
+                elif file_path.endswith('.rar'):
                     # 如果是,递归解压
                     # 如果是,递归解压
-                    unrar(file_path, dest_path + sep + get_desc_path(str(file)))
-                    # 删除已解压的zip文件(可选)
-                    remove(file_path)
-
-    return is_success, ''
-
-
-def unrar(rar_file_path, dest_dir):
-    # 检查目标目录是否存在,如果不存在则创建
-    # 解压zip文件
+                    unrar(file_path, os.path.join(dest_path, get_desc_path(str(file))))
+                    # 删除已解压的rar文件
+                    os.remove(file_path)
+
+    return is_success, None
+
+
+def unrar(rar_file_path: str, dest_dir: str) -> Tuple[bool, Optional[Exception]]:
+    """
+    解压rar文件
+    
+    Args:
+        rar_file_path: rar文件路径
+        dest_dir: 解压目标目录
+    
+    Returns:
+        (是否成功, 错误信息)
+    """
+    # 解压rar文件
     is_success = True
     is_success = True
-    trans_print('开始读取文件:', rar_file_path)
+    debug('开始读取文件:', rar_file_path)
     dest_path = dest_dir
     dest_path = dest_dir
-    trans_print("解压到:", dest_path)
-    if not path.exists(dest_path):
-        makedirs(dest_path)
+    debug("解压到:", dest_path)
+
+    # 确保目标路径存在
+    create_file_path(dest_path)
 
 
     try:
     try:
         # 打开RAR文件
         # 打开RAR文件
@@ -91,33 +128,41 @@ def unrar(rar_file_path, dest_dir):
                 # 解压文件到目标目录
                 # 解压文件到目标目录
                 rf.extract(member, dest_path)
                 rf.extract(member, dest_path)
     except Exception as e:
     except Exception as e:
-        trans_print(traceback.format_exc())
-        logger.exception(e)
+        error(traceback.format_exc())
         is_success = False
         is_success = False
-        trans_print('不是rar文件:', rar_file_path)
+        error('不是rar文件:', rar_file_path)
         return is_success, e
         return is_success, e
 
 
     # 遍历解压后的文件
     # 遍历解压后的文件
-    print('解压再次读取', dest_path)
+    debug('解压再次读取', dest_path)
     if is_success:
     if is_success:
-        for root, dirs, files in walk(dest_path):
+        for root, dirs, files in os.walk(dest_path):
             for file in files:
             for file in files:
-                file_path = path.join(root, file)
-                # 检查文件是否是zip文件
+                file_path = os.path.join(root, file)
+                # 检查文件是否是rar文件
                 if file_path.endswith('.rar'):
                 if file_path.endswith('.rar'):
                     # 如果是,递归解压
                     # 如果是,递归解压
                     unrar(file_path, get_desc_path(file_path))
                     unrar(file_path, get_desc_path(file_path))
-                    # 删除已解压的zip文件(可选)
-                    remove(file_path)
+                    # 删除已解压的rar文件
+                    os.remove(file_path)
 
 
-                if file_path.endswith('.zip'):
+                elif file_path.endswith('.zip'):
                     # 如果是,递归解压
                     # 如果是,递归解压
                     unzip(file_path, get_desc_path(file_path))
                     unzip(file_path, get_desc_path(file_path))
-                    # 删除已解压的zip文件(可选)
-                    remove(file_path)
-
-    return is_success, ''
-
-
-def get_desc_path(path):
-    return path[0:path.rfind(".")]
+                    # 删除已解压的zip文件
+                    os.remove(file_path)
+
+    return is_success, None
+
+
+def get_desc_path(file_path: str) -> str:
+    """
+    获取文件路径的描述路径(去除扩展名)
+    
+    Args:
+        file_path: 文件路径
+    
+    Returns:
+        去除扩展名的路径
+    """
+    return file_path[0:file_path.rfind(".")]

+ 0 - 0
wind_farm/CGN/__init__.py


+ 0 - 83
wind_farm/CGN/minute_data.py

@@ -1,83 +0,0 @@
-import datetime
-import logging
-import os
-
-import pandas as pd
-import sys
-from sqlalchemy import create_engine
-
-# 更新为第三方数据源
-engine = create_engine('mysql+pymysql://root:admin123456@192.168.50.235:30306/appoint')
-
-base_dir = r'/data/logs/104'
-save_dir = base_dir + os.sep + 'minute'
-log_dir = base_dir + os.sep + 'logs' + os.sep + 'minute'
-
-wind_farm_code_dict = {
-    '风场编号1': '山西风场',
-    '风场编号2': '桂林风场'
-}
-
-
-def create_dir(save_dir, is_file=False):
-    if is_file:
-        save_dir = os.path.dirname(save_dir)
-    os.makedirs(save_dir, exist_ok=True)
-
-
-def init_log():
-    logger = logging.getLogger("104data")
-    logger.setLevel(logging.INFO)
-    stout_handle = logging.StreamHandler(sys.stdout)
-    stout_handle.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    stout_handle.setLevel(logging.INFO)
-    logger.addHandler(stout_handle)
-    create_dir(log_dir)
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-info.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.INFO)
-    logger.addHandler(file_handler)
-
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-error.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.ERROR)
-    logger.addHandler(file_handler)
-
-    return logger
-
-
-logger = init_log()
-
-
-def info_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.info(message)
-
-
-def error_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.error(message)
-
-
-def get_data_and_save_file(df_sql, save_path):
-    info_print(df_sql)
-    df = pd.read_sql_query(df_sql, engine)
-    info_print(df.shape)
-
-
-if __name__ == '__main__':
-    info_print("开始执行")
-    begin = datetime.datetime.now()
-    yestoday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y%m%d')
-    yestoday_sql = f"select * from information_schema.TABLES where TABLE_NAME = {yestoday}"
-
-    get_data_and_save_file(yestoday_sql,
-                           os.path.join(save_dir, wind_farm_code_dict['风场编号1'], yestoday[0:4], yestoday[0:6],
-                                        f'{yestoday}.csv.gz'))
-
-    info_print("执行结束,总耗时:", datetime.datetime.now() - begin)

+ 0 - 83
wind_farm/CGN/purge_history_data.py

@@ -1,83 +0,0 @@
-import datetime
-import logging
-import os
-import sys
-
-import pandas as pd
-from sqlalchemy import create_engine, text
-
-engine = create_engine('mysql+pymysql://root:admin123456@192.168.50.235:30306/appoint')
-
-base_dir = r'/data/logs/104'
-log_dir = base_dir + os.sep + 'logs' + os.sep + 'delete'
-
-
-def create_dir(save_dir, is_file=False):
-    if is_file:
-        save_dir = os.path.dirname(save_dir)
-    os.makedirs(save_dir, exist_ok=True)
-
-
-def init_log():
-    logger = logging.getLogger("104data")
-    logger.setLevel(logging.INFO)
-    stout_handle = logging.StreamHandler(sys.stdout)
-    stout_handle.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    stout_handle.setLevel(logging.INFO)
-    logger.addHandler(stout_handle)
-    create_dir(log_dir)
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-info.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.INFO)
-    logger.addHandler(file_handler)
-
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-error.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.ERROR)
-    logger.addHandler(file_handler)
-
-    return logger
-
-
-logger = init_log()
-
-
-def info_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.info(message)
-
-
-def error_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.error(message)
-
-
-def drop_table(lastdays):
-    # 构建查询语句
-    query = text(
-        f"SELECT TABLE_NAME FROM information_schema.TABLES WHERE TABLE_SCHEMA='appoint' AND TABLE_NAME like '{lastdays}%'")
-    table_df = pd.read_sql(query, engine)
-
-    info_print('查询到表', table_df['TABLE_NAME'].values)
-    for table_name in table_df['TABLE_NAME'].values:
-        # 构建删除表的SQL语句
-        drop_query = text(f"DROP TABLE {table_name}")
-        # 执行删除操作
-        with engine.connect() as connection:
-            connection.execute(drop_query)
-
-        info_print(f"Table {table_name} deleted")
-
-
-if __name__ == '__main__':
-    info_print("开始执行")
-    begin = datetime.datetime.now()
-    lastdays = (datetime.datetime.now() - datetime.timedelta(days=8)).strftime('%Y%m%d')
-    print(lastdays)
-    drop_table(lastdays)
-    info_print("执行结束,总耗时:", datetime.datetime.now() - begin)

+ 0 - 173
wind_farm/CGN/second_data.py

@@ -1,173 +0,0 @@
-import datetime
-import json
-import logging
-import multiprocessing
-import os
-import traceback
-
-import sys
-
-import numpy as np
-import pandas as pd
-from sqlalchemy import create_engine
-
-engine = create_engine('mysql+pymysql://root:admin123456@192.168.50.235:30306/appoint')
-
-base_dir = r'/data/logs/104'
-save_dir = base_dir + os.sep + 'second'
-log_dir = base_dir + os.sep + 'logs' + os.sep + 'second'
-
-def create_dir(save_dir, is_file=False):
-    if is_file:
-        save_dir = os.path.dirname(save_dir)
-    os.makedirs(save_dir, exist_ok=True)
-
-
-def init_log():
-    logger = logging.getLogger("104data")
-    logger.setLevel(logging.INFO)
-    stout_handle = logging.StreamHandler(sys.stdout)
-    stout_handle.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    stout_handle.setLevel(logging.INFO)
-    logger.addHandler(stout_handle)
-    create_dir(log_dir)
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-info.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.INFO)
-    logger.addHandler(file_handler)
-
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-error.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.ERROR)
-    logger.addHandler(file_handler)
-
-    return logger
-
-
-logger = init_log()
-
-
-def get_all_mesurement_conf():
-    sql = "select * from measurement_conf "
-    return pd.read_sql(sql, engine)
-
-
-def get_all_mesurepoint_conf():
-    sql = "select * from measurepoint_conf t where t.status = 1"
-    return pd.read_sql(sql, engine)
-
-
-def df_value_to_dict(df, key='col1', value='col2'):
-    """
-    :param df: dataframe
-    :param key: 字典的key,如果重复,则返回
-    :param value: 字典的value
-    :return:
-    """
-    result_dict = dict()
-    for k, v in zip(df[key], df[value]):
-        if k in result_dict.keys():
-            if type(result_dict[k]) == list:
-                result_dict[k].append(v)
-            else:
-                result_dict[k] = [result_dict[k]]
-                result_dict[k].append(v)
-        else:
-            result_dict[k] = v
-
-    return result_dict
-
-
-def info_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.info(message)
-
-
-def error_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.error(message)
-
-
-def exists_table(table_name):
-    sql = f"SELECT * FROM information_schema.tables WHERE table_schema = 'appoint' AND table_name = '{table_name}'"
-    info_print(sql)
-    table_df = pd.read_sql_query(sql, engine)
-    if table_df.empty:
-        return False
-    return True
-
-
-def get_data_and_save_file(table_name, save_path, measurepoint_use_dict):
-    if not exists_table(table_name):
-        error_print(f"{table_name} 表不存在")
-    else:
-        df_sql = f"SELECT * FROM {table_name}"
-        info_print(df_sql)
-        df = pd.read_sql_query(df_sql, engine)
-        info_print(df.shape)
-
-        data_dict = dict()
-        for receive_time, information_object_data in zip(df['receive_time'],
-                                                         df['information_object_data']):
-
-            json_data = json.loads(information_object_data)
-            for k, v in json_data.items():
-                k = int(k)
-                wind_num = k // 103 + 1
-                mesurepoint_num = k % 103
-
-                if wind_num not in data_dict.keys():
-                    data_dict[wind_num] = dict()
-
-                if receive_time not in data_dict[wind_num].keys():
-                    data_dict[wind_num][receive_time] = dict()
-
-                if mesurepoint_num in measurepoint_use_dict.keys():
-                    data_dict[wind_num][receive_time][mesurepoint_num] = v
-
-        datas = list()
-        for wind_num, data in data_dict.items():
-            for receive_time, mesurepoint_data in data.items():
-                data = [wind_num, receive_time]
-                for point_num in measurepoint_use_dict.keys():
-                    data.append(mesurepoint_data[point_num] if point_num in mesurepoint_data.keys() else np.nan)
-                if len(data) > 2:
-                    datas.append(data)
-
-        cols = ['风机编号', '时间']
-        cols.extend(measurepoint_use_dict.values())
-        result_df = pd.DataFrame(data=datas, columns=cols)
-        result_df.sort_values(by=['风机编号', '时间'])
-        create_dir(save_path, True)
-        result_df.to_csv(save_path, encoding='utf8', index=False, compression='gzip')
-        info_print("文件", save_path, '保存成功')
-
-
-if __name__ == '__main__':
-    info_print("开始执行")
-    begin = datetime.datetime.now()
-    try:
-        measurepoint_conf_df = get_all_mesurepoint_conf()
-        measurepoint_use_dict = df_value_to_dict(measurepoint_conf_df, 'id', 'name')
-
-        yestoday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y%m%d')
-
-        measurement_conf_df = get_all_mesurement_conf()
-        tables = list()
-        for id, measurement_wind_field in zip(measurement_conf_df['id'], measurement_conf_df['measurement_wind_field']):
-            tables.append(
-                (f'{yestoday}_{id}', os.path.join(save_dir, measurement_wind_field, yestoday[0:4], yestoday[0:6],
-                                                  yestoday + '.csv.gz')))
-
-        with multiprocessing.Pool(len(tables)) as pool:
-            pool.starmap(get_data_and_save_file, [(t[0], t[1], measurepoint_use_dict) for t in tables])
-    except Exception as e:
-        error_print(traceback.format_exc())
-        raise e
-
-    info_print("执行结束,总耗时:", datetime.datetime.now() - begin)

+ 0 - 0
wind_farm/__init__.py