wzl 1 mese fa
parent
commit
038509a11a
45 ha cambiato i file con 1720 aggiunte e 1613 eliminazioni
  1. 9 9
      app_run.py
  2. 97 0
      conf/constants.py
  3. 10 10
      conf/etl_config_dev.yaml
  4. 3 3
      etl/common/ArchiveFile.py
  5. 39 27
      etl/common/BaseDataTrans.py
  6. 3 3
      etl/common/ClearData.py
  7. 117 44
      etl/common/CombineAndSaveFormalFile.py
  8. 96 20
      etl/common/PathsAndTable.py
  9. 47 22
      etl/common/SaveToDb.py
  10. 43 18
      etl/common/UnzipAndRemove.py
  11. 3 3
      etl/wind_power/fault_warn/FaultWarnTrans.py
  12. 4 4
      etl/wind_power/laser/LaserTrans.py
  13. 6 6
      etl/wind_power/min_sec/ClassIdentifier.py
  14. 51 26
      etl/wind_power/min_sec/MinSecTrans.py
  15. 111 45
      etl/wind_power/min_sec/ReadAndSaveTmp.py
  16. 53 40
      etl/wind_power/min_sec/StatisticsAndSaveTmpFormalFile.py
  17. 42 7
      etl/wind_power/min_sec/TransParam.py
  18. 53 18
      etl/wind_power/wave/WaveTrans.py
  19. 3 3
      service/common_connect.py
  20. 6 2
      service/trans_conf_service.py
  21. 95 72
      service/trans_service.py
  22. 4 2
      utils/common.py
  23. 137 12
      utils/conf/read_conf.py
  24. 231 41
      utils/db/ConnectMysql.py
  25. 6 6
      utils/db/ConnectMysql_tidb_fix.py
  26. 2 3
      utils/df_utils/util.py
  27. 139 38
      utils/file/trans_methods.py
  28. 0 202
      utils/file/trans_methods.py_1
  29. 99 25
      utils/log/trans_log.py
  30. 113 26
      utils/systeminfo/sysinfo.py
  31. 0 0
      utils/tmp_util/__init__.py
  32. 0 37
      utils/tmp_util/合并文件.py
  33. 0 100
      utils/tmp_util/整理INSERT到批量INSERT.py
  34. 0 87
      utils/tmp_util/神木_完整度_10分.py
  35. 0 90
      utils/tmp_util/神木_完整度_1分.py
  36. 0 18
      utils/tmp_util/获取台账所有wind表信息.py
  37. 0 76
      utils/tmp_util/表添加列.py
  38. 0 49
      utils/tmp_util/表添加注释.py
  39. 0 27
      utils/tmp_util/颗粒度变大.py
  40. 98 53
      utils/zip/unzip.py
  41. 0 0
      wind_farm/CGN/__init__.py
  42. 0 83
      wind_farm/CGN/minute_data.py
  43. 0 83
      wind_farm/CGN/purge_history_data.py
  44. 0 173
      wind_farm/CGN/second_data.py
  45. 0 0
      wind_farm/__init__.py

+ 9 - 9
app_run.py

@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/6/11
 # @Author  : 魏志亮
+import os
 import sys
-from os import *
 
 from utils.conf.read_conf import yaml_conf, read_conf
 
@@ -11,7 +11,7 @@ def get_exec_data(run_count=1):
     now_run_count = get_now_running_count()
     data = None
     if now_run_count >= run_count:
-        trans_print(f"当前有{now_run_count}个任务在执行")
+        info(f"当前有{now_run_count}个任务在执行")
     else:
         data = get_batch_exec_data()
     return data
@@ -22,7 +22,7 @@ def run(save_db=True, run_count=1, yaml_config=None, step=0, end=999):
     data = get_exec_data(run_count)
 
     if data is None:
-        trans_print("没有需要执行的任务")
+        info("没有需要执行的任务")
         return
 
     exec_process = None
@@ -55,14 +55,14 @@ if __name__ == "__main__":
     if env.endswith(".yaml"):
         conf_path = env
     else:
-        conf_path = path.abspath(f"./conf/etl_config_{env}.yaml")
+        conf_path = os.path.abspath(f"./conf/etl_config_{env}.yaml")
 
-    environ["ETL_CONF"] = conf_path
+    os.environ["ETL_CONF"] = conf_path
     yaml_config = yaml_conf(conf_path)
-    environ["env"] = env
+    os.environ["env"] = env
     run_count = int(read_conf(yaml_config, "run_batch_count", 1))
 
-    from utils.log.trans_log import trans_print
+    from utils.log.trans_log import info
     from service.trans_conf_service import (
         update_timeout_trans_data,
         get_now_running_count,
@@ -73,7 +73,7 @@ if __name__ == "__main__":
     from etl.wind_power.laser.LaserTrans import LaserTrans
     from etl.wind_power.wave.WaveTrans import WaveTrans
 
-    trans_print("所有请求参数:", sys.argv, "env:", env, "最大可执行个数:", run_count)
-    trans_print("配置文件路径:", environ.get("ETL_CONF"))
+    info("所有请求参数:", sys.argv, "env:", env, "最大可执行个数:", run_count)
+    info("配置文件路径:", os.environ.get("ETL_CONF"))
 
     run(run_count=run_count, yaml_config=yaml_config, step=0)

+ 97 - 0
conf/constants.py

@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2026/3/12
+# @Author  : 系统自动生成
+# 项目常量定义
+
+
+# 文件类型常量
+class FileTypes:
+    """文件类型常量"""
+    # Excel相关文件类型
+    EXCEL_TYPES = ['xls', 'xlsx', 'xlsm', 'xlsb', 'odf', 'ods', 'csv', 'csv.gz']
+    # 压缩文件类型
+    ZIP_TYPES = ['rar', 'zip']
+
+
+# 数据处理常量
+class DataProcessing:
+    """数据处理常量"""
+    # 时间戳列名
+    TIME_STAMP_COLUMN = 'time_stamp'
+    # NaN替换值
+    NAN_REPLACE_VALUE = -999999999
+    # 有功功率单位判断阈值
+    POWER_UNIT_THRESHOLD = 100000
+    # 时间间隔
+    TIME_INTERVAL = '10T'
+    # 非数值列
+    NOT_DOUBLE_COLS = ['wind_turbine_number', 'wind_turbine_name', 'time_stamp', 
+                       'param6', 'param7', 'param8', 'param9', 'param10']
+
+
+# 并行处理常量
+class ParallelProcessing:
+    """并行处理常量"""
+    # 最大进程数
+    MAX_PROCESSES = 8
+    # 最大批次数
+    MAX_BATCHES = 10
+    # CPU使用百分比
+    CPU_USAGE_PERCENT = 2 / 3
+
+
+# 数据库常量
+class Database:
+    """数据库常量"""
+    # 表引擎
+    TABLE_ENGINE = 'InnoDB'
+    # 默认字符集
+    DEFAULT_CHARSET = 'utf8mb4'
+    # 批处理大小
+    BATCH_SIZE = 100000
+
+
+# 日志常量
+class Log:
+    """日志常量"""
+    # 默认日志路径
+    DEFAULT_LOG_PATH = "/data/logs"
+    # 日志文件名前缀
+    LOG_FILE_PREFIX = "etl_tools_"
+
+
+# 路径常量
+class Paths:
+    """路径常量"""
+    # 临时文件基础路径
+    DEFAULT_TMP_BASE_PATH = "/tmp"
+    # 归档路径
+    DEFAULT_ARCHIVE_PATH = "/tmp/archive"
+
+
+# 状态常量
+class Status:
+    """状态常量"""
+    # 成功状态
+    SUCCESS = 1
+    # 错误状态
+    ERROR = 0
+    # 运行状态
+    RUNNING = 2
+
+
+# 类型常量
+class Types:
+    """类型常量"""
+    # 秒级数据
+    SECOND = 'second'
+    # 分钟级数据
+    MINUTE = 'minute'
+    # 故障数据
+    FAULT = 'fault'
+    # 告警数据
+    WARN = 'warn'
+    # 波形数据
+    WAVE = 'wave'
+    # 激光数据
+    LASER = 'laser'

+ 10 - 10
conf/etl_config_dev.yaml

@@ -1,24 +1,24 @@
 plt:
-  database: energy_ty
+  database: energy
   host: 192.168.50.233
   password: admin123456
   port: 3306
   user: admin
 
-# trans:
-#   database: energy_data
-#   host: 192.168.50.235
-#   password: admin123456
-#   port: 30306
-#   user: root
-
 trans:
   database: energy_data
-  host: 106.120.102.238
+  host: 192.168.50.235
   password: admin123456
-  port: 10336
+  port: 30306
   user: root
 
+#trans:
+#  database: energy_data
+#  host: 106.120.102.238
+#  password: admin123456
+#  port: 10336
+#  user: root
+
 # 如果要放在原始路径,则配置这个 以下面的名称作为切割点,新建清理数据文件夹
 etl_origin_path_contain: 收资数据
 # 如果单独保存,配置这个路径

+ 3 - 3
etl/common/ArchiveFile.py

@@ -3,7 +3,7 @@ import shutil
 
 from etl.common.PathsAndTable import PathsAndTable
 from service.trans_conf_service import update_archive_success
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info
 
 
 class ArchiveFile(object):
@@ -19,6 +19,6 @@ class ArchiveFile(object):
         if os.path.exists(self.pathsAndTable.get_tmp_formal_path()):
             shutil.make_archive(self.pathsAndTable.get_archive_path(), 'zip', self.pathsAndTable.get_tmp_formal_path())
             update_archive_success(self.exec_id, f"{self.pathsAndTable.get_archive_path()}.zip")
-            trans_print(f"文件夹已归档为 {self.pathsAndTable.get_archive_path()}.zip")
+            info(f"文件夹已归档为 {self.pathsAndTable.get_archive_path()}.zip")
         else:
-            trans_print(f"文件夹 {self.pathsAndTable.get_tmp_formal_path()} 不存在")
+            info(f"文件夹 {self.pathsAndTable.get_tmp_formal_path()} 不存在")

+ 39 - 27
etl/common/BaseDataTrans.py

@@ -10,12 +10,24 @@ from service.plt_service import get_all_wind
 from service.trans_conf_service import update_trans_status_success, update_trans_status_error, \
     update_trans_status_running
 from utils.file.trans_methods import read_excel_files
-from utils.log.trans_log import trans_print, set_trance_id
+from utils.log.trans_log import set_trance_id, info, error
 
 
 class BaseDataTrans(object):
-    def __init__(self, data: dict = None, save_db=True, yaml_config=None, step=0, end=999):
-
+    """数据转换基类"""
+
+    def __init__(self, data: dict = None, save_db: bool = True, yaml_config: dict = None, step: int = 0,
+                 end: int = 999):
+        """
+        初始化数据转换基类
+        
+        Args:
+            data: 任务数据字典
+            save_db: 是否保存到数据库
+            yaml_config: YAML配置
+            step: 开始步骤
+            end: 结束步骤
+        """
         self.id = data['id']
         self.task_name = data['task_name']
         self.transfer_type = data['transfer_type']
@@ -37,7 +49,7 @@ class BaseDataTrans(object):
                                                self.wind_farm_name, self.transfer_type, save_db, self.save_zip,
                                                self.yaml_config, self.wind_col_trans)
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             update_trans_status_error(self.id, str(e), self.save_db)
             raise e
 
@@ -94,70 +106,70 @@ class BaseDataTrans(object):
             # 0
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
-                trans_print("开始清理数据,临时文件夹:", self.pathsAndTable.get_tmp_path())
+                info("开始清理数据,临时文件夹:", self.pathsAndTable.get_tmp_path())
                 self.clean_file_and_db()
-                trans_print("清理数据结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("清理数据结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
             now_index = now_index + 1
             # 1
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
-                trans_print("开始解压移动文件")
+                info("开始解压移动文件")
                 self.unzip_or_remove_to_tmp_dir()
-                trans_print("解压移动文件结束:耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("解压移动文件结束:耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
             now_index = now_index + 1
             # 2
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
-                trans_print("开始保存数据到临时文件")
+                info("开始保存数据到临时文件")
                 self.read_and_save_tmp_file()
-                trans_print("保存数据到临时文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("保存数据到临时文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
             now_index = now_index + 1
             # 3
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
-                trans_print("开始保存到临时正式文件")
+                info("开始保存到临时正式文件")
                 self.statistics_and_save_tmp_formal_file()
-                trans_print("保存到临时正式文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("保存到临时正式文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
             now_index = now_index + 1
             # 4
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
-                trans_print("开始保存归档文件")
+                info("开始保存归档文件")
                 self.archive_file()
-                trans_print("保存到保存归档文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("保存到保存归档文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
             now_index = now_index + 1
             # 5
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
-                trans_print("开始保存数据到正式文件")
+                info("开始保存数据到正式文件")
                 self.combine_and_save_formal_file()
-                trans_print("保存数据到正式文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("保存数据到正式文件结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
             now_index = now_index + 1
             # 6
             if self.step <= now_index <= self.end:
                 begin = datetime.datetime.now()
-                trans_print("开始保存到数据库,是否存库:", self.pathsAndTable.save_db)
+                info("开始保存到数据库,是否存库:", self.pathsAndTable.save_db)
                 self.save_to_db()
-                trans_print("保存到数据结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
-                            datetime.datetime.now() - total_begin)
+                info("保存到数据结束,耗时:", datetime.datetime.now() - begin, "总耗时:",
+                     datetime.datetime.now() - total_begin)
 
             self.update_exec_progress()
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             update_trans_status_error(self.id, str(e), self.save_db)
             raise e
         finally:
             self.pathsAndTable.delete_tmp_files()
-            trans_print("执行结束,总耗时:", str(datetime.datetime.now() - total_begin))
+            info("执行结束,总耗时:", str(datetime.datetime.now() - total_begin))

+ 3 - 3
etl/common/ClearData.py

@@ -2,7 +2,7 @@ import datetime
 
 from etl.common.PathsAndTable import PathsAndTable
 from service.trans_conf_service import update_trans_transfer_progress
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info
 
 
 class ClearData(object):
@@ -19,8 +19,8 @@ class ClearData(object):
         # self.pathsAndTable.delete_batch_files()
 
     def run(self):
-        trans_print("开始清理数据,临时文件夹:", self.pathsAndTable.get_tmp_path())
+        info("开始清理数据,临时文件夹:", self.pathsAndTable.get_tmp_path())
         begin = datetime.datetime.now()
         self.clean_data()
         update_trans_transfer_progress(self.pathsAndTable.id, 5, self.pathsAndTable.save_db)
-        trans_print("清理数据结束,耗时:", datetime.datetime.now() - begin)
+        info("清理数据结束,耗时:", datetime.datetime.now() - begin)

+ 117 - 44
etl/common/CombineAndSaveFormalFile.py

@@ -1,61 +1,134 @@
 import multiprocessing
 import os
+from typing import Dict, List, Tuple, Optional
 
 import pandas as pd
 
+from conf.constants import DataProcessing, ParallelProcessing
 from etl.common.PathsAndTable import PathsAndTable
 from utils.file.trans_methods import read_excel_files, read_file_to_df, copy_to_new
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, debug
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 
 
-class CombineAndSaveFormalFile(object):
+class CombineAndSaveFormalFile:
+    """合并并保存正式文件"""
 
-    def __init__(self, pathsAndTable: PathsAndTable):
-        self.pathsAndTable = pathsAndTable
-        self.update_files = multiprocessing.Manager().list()
+    # 常量定义
+    TIME_STAMP_COLUMN = DataProcessing.TIME_STAMP_COLUMN
 
-    def combine_and_save(self, file_path, key, exists_file_path):
-        exists_same = False
-        if exists_file_path:
-            exists_same = True
+    def __init__(self, paths_and_table: PathsAndTable):
+        """
+        初始化合并器
+
+        Args:
+            paths_and_table: 路径和表信息对象
+        """
+        self.paths_and_table = paths_and_table
+        self.updated_files = multiprocessing.Manager().list()
+
+    def _merge_dataframes(self, exists_df: pd.DataFrame, now_df: pd.DataFrame) -> pd.DataFrame:
+        """
+        合并两个数据框并去重排序
+
+        Args:
+            exists_df: 已存在的数据框
+            now_df: 当前的数据框
+
+        Returns:
+            合并后的数据框
+        """
+        combined_df = pd.concat([exists_df, now_df])
+        # 去重,保留最新的数据
+        combined_df = combined_df.drop_duplicates(
+            subset=self.TIME_STAMP_COLUMN,
+            keep='last'
+        )
+        # 按时间戳排序
+        return combined_df.sort_values(
+            by=self.TIME_STAMP_COLUMN
+        ).reset_index(drop=True)
+
+    def _save_combined_file(self, file_path: str, key: Tuple[str, str], exists_file_path: Optional[str]) -> None:
+        """
+        保存合并后的文件
+
+        Args:
+            file_path: 新文件路径
+            key: 文件键值 (目录名, 文件名)
+            exists_file_path: 已存在的文件路径,如果为None则表示不存在
+        """
+        has_exists = exists_file_path is not None
+
+        if has_exists:
+            # 合并并保存
             exists_df = read_file_to_df(exists_file_path)
             now_df = read_file_to_df(file_path)
-            # 合并两个 DataFrame
-            combined_df = pd.concat([exists_df, now_df])
-            # 去重,保留 now_df 的值
-            combined_df = combined_df.drop_duplicates(subset='time_stamp', keep='last')
-            # 按 time_stamp 排序
-            combined_df = combined_df.sort_values(by='time_stamp').reset_index(drop=True)
+            combined_df = self._merge_dataframes(exists_df, now_df)
             combined_df.to_csv(exists_file_path, encoding='utf-8', index=False)
-            self.update_files.append(exists_file_path)
+            self.updated_files.append(exists_file_path)
         else:
-            save_path = str(os.path.join(self.pathsAndTable.get_save_path(), key[0], key[1]))
-            copy_to_new(file_path, save_path)
-            self.update_files.append(save_path)
-        trans_print(f"{key[0]}/{key[1]} {'包含' if exists_same else '不包含'} 相同文件,保存成功")
-
-    def combine_and_save_formal_file(self):
-        exists_files = read_excel_files(self.pathsAndTable.get_save_path())
-        exists_file_maps = dict()
-        for file_path in exists_files:
-            name = (os.path.basename(os.path.dirname(file_path)), os.path.basename(file_path))
-            exists_file_maps[name] = file_path
-
-        new_files = read_excel_files(self.pathsAndTable.get_tmp_formal_path())
-        new_file_maps = dict()
-        for file_path in new_files:
-            name = (os.path.basename(os.path.dirname(file_path)), os.path.basename(file_path))
-            new_file_maps[name] = file_path
-
-        same_keys = list(set(exists_file_maps.keys()).intersection(new_file_maps.keys()))
-        split_count = get_available_cpu_count_with_percent(2 / 3)
-        with multiprocessing.Pool(split_count) as pool:
-            pool.starmap(self.combine_and_save,
-                         [(file_path, key, exists_file_maps[key] if key in same_keys else None) for key, file_path in
-                          new_file_maps.items()])
-
-    def run(self):
+            # 复制新文件
+            save_dir = str(os.path.join(
+                self.paths_and_table.get_save_path(),
+                key[0],
+                key[1]
+            ))
+            copy_to_new(file_path, save_dir)
+            self.updated_files.append(save_dir)
+
+        # 记录日志
+        status = "包含" if has_exists else "不包含"
+        debug(f"{key[0]}/{key[1]} {status} 相同文件,保存成功")
+
+    def _build_file_maps(self, base_path: str) -> Dict[Tuple[str, str], str]:
+        """
+        构建文件映射字典
+
+        Args:
+            base_path: 基础路径
+
+        Returns:
+            文件路径映射字典,键为(目录名, 文件名),值为完整路径
+        """
+        files = read_excel_files(base_path)
+        return {
+            (os.path.basename(os.path.dirname(file_path)), os.path.basename(file_path)): file_path
+            for file_path in files
+        }
+
+    def combine_and_save_formal_file(self) -> None:
+        """合并并保存正式文件的主方法"""
+        # 构建已存在文件和新文件的映射
+        exists_file_maps = self._build_file_maps(self.paths_and_table.get_save_path())
+        new_file_maps = self._build_file_maps(self.paths_and_table.get_tmp_formal_path())
+
+        # 找出相同键的文件
+        same_keys = set(exists_file_maps.keys()) & set(new_file_maps.keys())
+
+        # 准备并行处理参数
+        process_args = [
+            (
+                file_path,
+                key,
+                exists_file_maps.get(key) if key in same_keys else None
+            )
+            for key, file_path in new_file_maps.items()
+        ]
+
+        # 使用并行处理
+        cpu_count = get_available_cpu_count_with_percent(ParallelProcessing.CPU_USAGE_PERCENT)
+        cpu_count = min(cpu_count, ParallelProcessing.MAX_PROCESSES)
+        with multiprocessing.Pool(cpu_count) as pool:
+            pool.starmap(self._save_combined_file, process_args)
+
+    def run(self) -> List[str]:
+        """
+        执行合并操作
+
+        Returns:
+            更新后的文件路径列表
+        """
         self.combine_and_save_formal_file()
-        print(self.update_files)
-        return list(self.update_files)
+        info(f"共处理了 {len(self.updated_files)} 个文件")
+        return list(self.updated_files)

+ 96 - 20
etl/common/PathsAndTable.py

@@ -1,14 +1,33 @@
 import shutil
 from os import path, sep
 
+from conf.constants import Paths
 from service.trans_service import creat_min_sec_table, create_warn_fault_table
 from utils.conf.read_conf import *
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info
 
 
 class PathsAndTable(object):
-    def __init__(self, id=None, task_name=None, read_dir=None, wind_farm_code=None, wind_farm_name=None,
-                 read_type=None, save_db=True, save_zip=True, yaml_config=None, wind_col_trans=None):
+    """路径和表管理类"""
+
+    def __init__(self, id: int = None, task_name: str = None, read_dir: str = None, wind_farm_code: str = None,
+                 wind_farm_name: str = None, read_type: str = None, save_db: bool = True,
+                 save_zip: bool = True, yaml_config: dict = None, wind_col_trans: dict = None):
+        """
+        初始化路径和表管理类
+        
+        Args:
+            id: 任务ID
+            task_name: 任务名称
+            read_dir: 读取目录
+            wind_farm_code: 风电场编码
+            wind_farm_name: 风电场名称
+            read_type: 读取类型
+            save_db: 是否保存到数据库
+            save_zip: 是否保存为压缩文件
+            yaml_config: YAML配置
+            wind_col_trans: 风机列转换映射
+        """
         self.id = id
         self.task_name = task_name
         self.read_dir = read_dir
@@ -25,11 +44,11 @@ class PathsAndTable(object):
 
         self.use_tidb = read_conf(yaml_config, 'use_tidb', False)
 
-        self.tmp_base_path = read_conf(yaml_config, "tmp_base_path", "/tmp")
+        self.tmp_base_path = read_conf(yaml_config, "tmp_base_path", Paths.DEFAULT_TMP_BASE_PATH)
         if save_path_conf:
             self.save_path = save_path_conf + sep + self.wind_farm_name
         else:
-            find_index = read_dir.find(read_conf(yaml_config, 'etl_origin_path_contain', "etl_origin_path_contain"))
+            find_index = read_dir.find(read_conf(yaml_config, 'etl_origin_path_contain', "收资数据"))
             if find_index == -1:
                 raise Exception("路径未包含原始数据特定字符:" + read_dir)
             self.save_path = read_dir[0:find_index] + sep + "清理数据"
@@ -37,48 +56,105 @@ class PathsAndTable(object):
         if self.save_path is None:
             raise Exception("未配置保存路径:" + read_dir)
 
-        self.archive_path = read_conf(yaml_config, "archive_path", "/tmp/archive")
+        self.archive_path = read_conf(yaml_config, "archive_path", Paths.DEFAULT_ARCHIVE_PATH)
 
-    def get_save_path(self):
+    def get_save_path(self) -> str:
+        """
+        获取保存路径
+        
+        Returns:
+            保存路径
+        """
         return path.join(self.save_path, self.read_type)
 
-    def get_tmp_path(self):
+    def get_tmp_path(self) -> str:
+        """
+        获取临时路径
+        
+        Returns:
+            临时路径
+        """
         return str(path.join(self.tmp_base_path, str(self.id) + "_" + self.task_name + "_" + self.read_type))
 
-    def get_excel_tmp_path(self):
+    def get_excel_tmp_path(self) -> str:
+        """
+        获取Excel临时路径
+        
+        Returns:
+            Excel临时路径
+        """
         return path.join(self.get_tmp_path(), 'excel_tmp' + sep)
 
-    def get_read_tmp_path(self):
+    def get_read_tmp_path(self) -> str:
+        """
+        获取读取临时路径
+        
+        Returns:
+            读取临时路径
+        """
         return path.join(self.get_tmp_path(), 'read_tmp')
 
-    def get_merge_tmp_path(self, wind_turbine_number=None):
+    def get_merge_tmp_path(self, wind_turbine_number=None) -> str:
+        """
+        获取合并临时路径
+        
+        Args:
+            wind_turbine_number: 风机编号
+            
+        Returns:
+            合并临时路径
+        """
         if wind_turbine_number is None:
             return path.join(self.get_tmp_path(), 'merge_tmp')
         else:
             return path.join(self.get_tmp_path(), 'merge_tmp', str(wind_turbine_number))
 
-    def get_tmp_formal_path(self):
+    def get_tmp_formal_path(self) -> str:
+        """
+        获取正式临时路径
+        
+        Returns:
+            正式临时路径
+        """
         return path.join(self.get_tmp_path(), 'formal_tmp')
 
-    def get_archive_path(self):
+    def get_archive_path(self) -> str:
+        """
+        获取归档路径
+        
+        Returns:
+            归档路径
+        """
         return path.join(self.archive_path, self.wind_farm_name, self.read_type, f'{self.id}_{self.task_name}')
 
-    def get_table_name(self):
+    def get_table_name(self) -> str:
+        """
+        获取表名
+        
+        Returns:
+            表名
+        """
         return "_".join([self.wind_farm_code, self.read_type])
 
-    def delete_tmp_files(self):
-        trans_print("开始删除临时文件夹")
+    def delete_tmp_files(self) -> None:
+        """
+        删除临时文件
+        """
+        info("开始删除临时文件夹")
         if path.exists(self.get_tmp_path()):
             shutil.rmtree(self.get_tmp_path())
-        trans_print("删除临时文件夹删除成功")
+        info("删除临时文件夹删除成功")
 
-    def create_wind_farm_db(self):
+    def create_wind_farm_db(self) -> None:
+        """
+        创建风电场数据库表
+        """
         if self.save_db:
-            trans_print("开始创建表")
+            info("开始创建表")
             if self.read_type in ['second', 'minute']:
                 creat_min_sec_table(self.get_table_name(), self.read_type, self.wind_farm_name, self.use_tidb)
             elif self.read_type in ['fault', 'warn']:
                 create_warn_fault_table(self.get_table_name(), self.wind_farm_name, )
             else:
                 raise Exception("不支持的读取类型:" + self.read_type)
-            trans_print("建表结束")
+            info("建表结束")

+ 47 - 22
etl/common/SaveToDb.py

@@ -5,8 +5,7 @@ import traceback
 from etl.common.PathsAndTable import PathsAndTable
 from service.trans_conf_service import update_trans_transfer_progress
 from service.trans_service import save_scada_file_to_db, save_file_to_db
-from utils.file.trans_methods import split_array
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, error
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 
 
@@ -25,34 +24,60 @@ class SaveToDb(object):
         all_saved_files = [i for i in all_saved_files if
                            os.path.basename(i).split(".")[0] in self.pathsAndTable.wind_col_trans.keys()]
 
+        if not all_saved_files:
+            info("没有文件需要保存到数据库")
+            return
+
         self.pathsAndTable.create_wind_farm_db()
 
-        split_count = get_available_cpu_count_with_percent(percent=2 / 3)
-        split_count = split_count if split_count <= len(all_saved_files) else len(all_saved_files)
-        all_arrays = split_array(all_saved_files, split_count)
+        # 计算最佳进程数
+        max_processes = get_available_cpu_count_with_percent(percent=2 / 3)
+        max_processes = min(max_processes, len(all_saved_files), 10)  # 限制最大进程数为10
+
         try:
-            for index, arr in enumerate(all_arrays):
-                with multiprocessing.Pool(10) as pool:
-                    if self.pathsAndTable.read_type in ['minute', 'second']:
-                        pool.starmap(save_scada_file_to_db,
-                                     [(self.pathsAndTable.get_table_name(), file,
-                                       self.pathsAndTable.wind_col_trans[os.path.basename(file).split(".")[0]],
-                                       os.path.basename(os.path.dirname(file)),
-                                       self.batch_count,self.pathsAndTable.use_tidb) for file in arr])
-                    else:
-                        pool.starmap(save_file_to_db,
-                                     [(self.pathsAndTable.get_table_name(), file, self.batch_count) for file in arr])
-
-                update_trans_transfer_progress(self.pathsAndTable.id,
-                                               round(70 + 29 * (index + 1) / len(all_arrays), 2),
-                                               self.pathsAndTable.save_db)
+            # 创建一个进程池处理所有文件
+            with multiprocessing.Pool(max_processes) as pool:
+                if self.pathsAndTable.read_type in ['minute', 'second']:
+                    # 准备参数
+                    params = [(self.pathsAndTable.get_table_name(), file,
+                               self.pathsAndTable.wind_col_trans[os.path.basename(file).split(".")[0]],
+                               os.path.basename(os.path.dirname(file)),
+                               self.batch_count, self.pathsAndTable.use_tidb) for file in all_saved_files]
+
+                    # 分批次处理并更新进度
+                    batch_size = max(1, len(params) // 10)  # 最多10个批次
+                    for i in range(0, len(params), batch_size):
+                        batch_params = params[i:i + batch_size]
+                        pool.starmap(save_scada_file_to_db, batch_params)
+
+                        # 更新进度
+                        progress = 70 + 29 * (i + len(batch_params)) / len(params)
+                        update_trans_transfer_progress(self.pathsAndTable.id,
+                                                       round(progress, 2),
+                                                       self.pathsAndTable.save_db)
+
+                else:
+                    # 准备参数
+                    params = [(self.pathsAndTable.get_table_name(), file, self.batch_count) for file in all_saved_files]
+
+                    # 分批次处理并更新进度
+                    batch_size = max(1, len(params) // 10)  # 最多10个批次
+                    for i in range(0, len(params), batch_size):
+                        batch_params = params[i:i + batch_size]
+                        pool.starmap(save_file_to_db, batch_params)
+
+                        # 更新进度
+                        progress = 70 + 29 * (i + len(batch_params)) / len(params)
+                        update_trans_transfer_progress(self.pathsAndTable.id,
+                                                       round(progress, 2),
+                                                       self.pathsAndTable.save_db)
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             message = "保存到数据库错误,系统返回错误:" + str(e)
             raise ValueError(message)
 
     def run(self):
         if self.pathsAndTable.save_db:
             self.mutiprocessing_to_save_db()
-            update_trans_transfer_progress(self.pathsAndTable.id,  99,
+            update_trans_transfer_progress(self.pathsAndTable.id, 99,
                                            self.pathsAndTable.save_db)

+ 43 - 18
etl/common/UnzipAndRemove.py

@@ -1,54 +1,76 @@
 import multiprocessing
+import os
 import traceback
-from os import *
+from typing import List, Optional
 
+from conf.constants import ParallelProcessing
 from etl.common.PathsAndTable import PathsAndTable
 from service.trans_conf_service import update_trans_transfer_progress
 from utils.file.trans_methods import read_files, read_excel_files, copy_to_new, split_array
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, error
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 from utils.zip.unzip import unzip, unrar, get_desc_path
 
 
 class UnzipAndRemove(object):
-    def __init__(self, pathsAndTable: PathsAndTable, filter_types=None):
+    """解压缩并移动文件类"""
+
+    def __init__(self, pathsAndTable: PathsAndTable, filter_types: Optional[List[str]] = None):
+        """
+        初始化解压缩并移动文件类
+        
+        Args:
+            pathsAndTable: 路径和表对象
+            filter_types: 文件类型过滤器
+        """
         self.pathsAndTable = pathsAndTable
         self.filter_types = filter_types
 
-    def get_and_remove(self, file):
-
+    def get_and_remove(self, file: str) -> None:
+        """
+        解压缩或移动文件到临时路径
+        
+        Args:
+            file: 文件路径
+        """
         to_path = self.pathsAndTable.get_excel_tmp_path()
-        if str(file).endswith("zip"):
-            if str(file).endswith("csv.zip"):
+        file_lower = str(file).lower()
+        if file_lower.endswith("zip"):
+            if file_lower.endswith("csv.zip"):
                 copy_to_new(file, file.replace(self.pathsAndTable.read_dir, to_path).replace("csv.zip", 'csv.gz'))
             else:
                 desc_path = file.replace(self.pathsAndTable.read_dir, to_path)
                 unzip(file, get_desc_path(desc_path))
                 self.pathsAndTable.has_zip = True
-        elif str(file).endswith("rar"):
+        elif file_lower.endswith("rar"):
             desc_path = file.replace(self.pathsAndTable.read_dir, to_path)
             is_success, e = unrar(file, get_desc_path(desc_path))
             self.pathsAndTable.has_zip = True
-            if not is_success:
-                trans_print(traceback.format_exc())
-                pass
+            if not is_success:
+                # Keep failure visibility: dropping this check would silently
+                # ignore the `is_success`/`e` result returned by unrar().
+                error('解压rar失败: ', file, str(e))
         else:
             copy_to_new(file, file.replace(self.pathsAndTable.read_dir, to_path))
 
-    def remove_file_to_tmp_path(self):
+    def remove_file_to_tmp_path(self) -> List[str]:
+        """
+        将文件移动到临时路径
+        
+        Returns:
+            处理后的文件列表
+        """
         # 读取文件
         try:
-            if path.isfile(self.pathsAndTable.read_dir):
+            if os.path.isfile(self.pathsAndTable.read_dir):
                 all_files = [self.pathsAndTable.read_dir]
             else:
                 all_files = read_files(self.pathsAndTable.read_dir)
 
             # 最大取系统cpu的 三分之二
             split_count = get_available_cpu_count_with_percent(2 / 3)
+            # 限制最大进程数
+            split_count = min(split_count, ParallelProcessing.MAX_PROCESSES)
             all_arrays = split_array(all_files, split_count)
 
             for index, arr in enumerate(all_arrays):
-                pool_count = split_count if split_count < len(arr) else len(arr)
+                pool_count = min(split_count, len(arr))
                 with multiprocessing.Pool(pool_count) as pool:
                     pool.starmap(self.get_and_remove, [(i,) for i in arr])
                 update_trans_transfer_progress(self.pathsAndTable.id,
@@ -57,14 +79,17 @@ class UnzipAndRemove(object):
 
             all_files = read_excel_files(self.pathsAndTable.get_excel_tmp_path())
 
-            trans_print('读取文件数量:', len(all_files))
+            info('读取文件数量:', len(all_files))
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             message = "读取文件列表错误:" + self.pathsAndTable.read_dir + ",系统返回错误:" + str(e)
             raise ValueError(message)
         return all_files
 
-    def run(self):
+    def run(self) -> None:
+        """
+        运行解压缩和移动文件流程
+        """
         self.remove_file_to_tmp_path()
-        update_trans_transfer_progress(self.pathsAndTable.id,  20,
+        update_trans_transfer_progress(self.pathsAndTable.id, 20,
                                        self.pathsAndTable.save_db)

+ 3 - 3
etl/wind_power/fault_warn/FaultWarnTrans.py

@@ -10,7 +10,7 @@ from service.trans_service import get_fault_warn_conf, drop_table, create_warn_f
     save_file_to_db
 from utils.conf.read_conf import read_conf
 from utils.file.trans_methods import read_excel_files, read_file_to_df, create_file_path, valid_eval
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, error
 
 
 class FaultWarnTrans(BaseDataTrans):
@@ -27,14 +27,14 @@ class FaultWarnTrans(BaseDataTrans):
 
     # 第三步 读取 并 保存到临时文件
     def read_and_save_tmp_file(self):
-        trans_print("无需保存临时文件")
+        info("无需保存临时文件")
 
     # 读取并保存到临时正式文件
     def statistics_and_save_tmp_formal_file(self):
         conf_map = self.get_filed_conf()
         if conf_map is None or type(conf_map) == tuple or len(conf_map.keys()) == 0:
             message = f"未找到{self.id}的{self.transfer_type}配置"
-            trans_print(message)
+            error(message)
             update_trans_status_error(self.id, message, self.save_db)
         else:
 

+ 4 - 4
etl/wind_power/laser/LaserTrans.py

@@ -7,11 +7,11 @@ import numpy as np
 import pandas as pd
 
 from service.plt_service import get_all_wind
-from service.trans_service import save_df_to_db
 from service.trans_conf_service import update_trans_status_running, update_trans_transfer_progress, \
     update_trans_status_success
+from service.trans_service import save_df_to_db
 from utils.file.trans_methods import read_files, read_file_to_df
-from utils.log.trans_log import set_trance_id, trans_print
+from utils.log.trans_log import set_trance_id, info
 
 
 class LaserTrans():
@@ -56,7 +56,7 @@ class LaserTrans():
         trance_id = '-'.join([self.wind_farm_code, 'laser'])
         set_trance_id(trance_id)
         all_files = read_files(self.read_path, ['csv'])
-        trans_print(self.wind_farm_code, '获取文件总数为:', len(all_files))
+        info(self.wind_farm_code, '获取文件总数为:', len(all_files))
         pool_count = 8 if len(all_files) > 8 else len(all_files)
 
         with multiprocessing.Pool(pool_count) as pool:
@@ -70,7 +70,7 @@ class LaserTrans():
         update_trans_status_success(self.id, len(df['wind_turbine_number'].unique()), None,
                                     df['acquisition_time'].min(), df['acquisition_time'].max(), df.shape[0])
         # update_trans_status_success(self.id)
-        trans_print(self.wind_farm_code, '执行结束,总耗时:', (datetime.datetime.now() - self.begin))
+        info(self.wind_farm_code, '执行结束,总耗时:', (datetime.datetime.now() - self.begin))
 
 
 if __name__ == '__main__':

+ 6 - 6
etl/wind_power/min_sec/ClassIdentifier.py

@@ -5,7 +5,7 @@ import numpy as np
 from pandas import DataFrame
 
 from utils.file.trans_methods import read_file_to_df
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import error, warning, debug
 
 
 class ClassIdentifier(object):
@@ -35,11 +35,11 @@ class ClassIdentifier(object):
         self.cut_out_speed = cut_out_speed
 
         if self.rated_power is None:
-            trans_print(wind_turbine_number, "WARNING:rated_power配置为空的")
+            warning(wind_turbine_number, "WARNING:rated_power配置为空的")
             self.rated_power = 1500
 
         if self.cut_out_speed is None:
-            trans_print(cut_out_speed, "WARNING:cut_out_speed配置为空的")
+            warning(cut_out_speed, "WARNING:cut_out_speed配置为空的")
             self.cut_out_speed = 20
 
         if file_path is None and origin_df is None:
@@ -350,12 +350,12 @@ class ClassIdentifier(object):
     def run(self):
         # Implement your class identification logic here
         begin = datetime.datetime.now()
-        trans_print("打标签开始,风机号:", self.wind_turbine_number, self.df.shape)
+        debug("打标签开始,风机号:", self.wind_turbine_number, self.df.shape)
         try:
             df = self.identifier()
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             message = str(e) + ',风机编号:' + self.wind_turbine_number
             raise Exception('打标签失败:' + message)
-        trans_print("打标签结束,", df.shape, ",耗时:", datetime.datetime.now() - begin)
+        debug("打标签结束,", df.shape, ",耗时:", datetime.datetime.now() - begin)
         return df

+ 51 - 26
etl/wind_power/min_sec/MinSecTrans.py

@@ -3,6 +3,7 @@
 # @Author  : 魏志亮
 import multiprocessing
 import os.path
+from typing import Optional
 
 from etl.common.BaseDataTrans import BaseDataTrans
 from etl.common.CombineAndSaveFormalFile import CombineAndSaveFormalFile
@@ -12,26 +13,67 @@ from etl.wind_power.min_sec.TransParam import TransParam
 from service.trans_conf_service import update_trans_status_success, update_trans_status_error
 from service.trans_service import get_min_sec_conf
 from utils.conf.read_conf import read_conf
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import error
 
 
 class MinSecTrans(BaseDataTrans):
+    """分钟/秒级数据转换类"""
 
-    def __init__(self, data: dict = None, save_db=True, yaml_config=None, step=0, end=999):
+    # 转换列名列表
+    TRANS_COLS = [
+        'wind_turbine_number', 'time_stamp', 'active_power', 'rotor_speed', 'generator_speed',
+        'wind_velocity', 'pitch_angle_blade_1', 'pitch_angle_blade_2', 'pitch_angle_blade_3',
+        'cabin_position', 'true_wind_direction', 'yaw_error1', 'set_value_of_active_power',
+        'gearbox_oil_temperature', 'generatordrive_end_bearing_temperature',
+        'generatornon_drive_end_bearing_temperature', 'wind_turbine_status', 'wind_turbine_status2',
+        'cabin_temperature', 'twisted_cable_angle', 'front_back_vibration_of_the_cabin',
+        'side_to_side_vibration_of_the_cabin', 'actual_torque', 'given_torque',
+        'clockwise_yaw_count', 'counterclockwise_yaw_count', 'unusable', 'power_curve_available',
+        'required_gearbox_speed',
+        'inverter_speed_master_control', 'outside_cabin_temperature', 'main_bearing_temperature',
+        'main_bearing_temperature_2', 'gearbox_high_speed_shaft_bearing_temperature',
+        'gearboxmedium_speed_shaftbearing_temperature',
+        'gearbox_low_speed_shaft_bearing_temperature', 'generator_winding1_temperature',
+        'generator_winding2_temperature', 'generator_winding3_temperature',
+        'turbulence_intensity', 'grid_a_phase_current', 'grid_b_phase_current',
+        'grid_c_phase_current', 'reactive_power', 'param1', 'param2', 'param3', 'param4', 'param5',
+        'param6', 'param7', 'param8', 'param9', 'param10'
+    ]
+
+    def __init__(self, data: dict = None, save_db: bool = True, yaml_config: dict = None, step: int = 0,
+                 end: int = 999):
+        """
+        初始化分钟/秒级数据转换类
+        
+        Args:
+            data: 任务数据字典
+            save_db: 是否保存到数据库
+            yaml_config: YAML配置
+            step: 开始步骤
+            end: 结束步骤
+        """
         super(MinSecTrans, self).__init__(data, save_db, yaml_config, step, end)
         self.statistics_map = multiprocessing.Manager().dict()
         self.trans_param = self.get_trans_param()
         self.trans_param.wind_col_trans = self.wind_col_trans
 
     def get_filed_conf(self):
+        """获取配置"""
         return get_min_sec_conf(self.wind_farm_code, self.transfer_type)
 
-    def get_trans_param(self):
+    def get_trans_param(self) -> Optional[TransParam]:
+        """
+        获取转换参数
+        
+        Returns:
+            TransParam对象
+        """
         conf_map = self.get_filed_conf()
         if conf_map is None or type(conf_map) == tuple or len(conf_map.keys()) == 0:
             message = f"未找到{self.id}的{self.transfer_type}配置"
-            trans_print(message)
+            error(message)
             update_trans_status_error(self.id, message, self.save_db)
+            # Fail fast with the real message: the caller's __init__ immediately
+            # dereferences the returned TransParam (`.wind_col_trans`), so a None
+            # return would only resurface later as an opaque AttributeError.
+            raise ValueError(message)
         else:
             resolve_col_prefix = read_conf(conf_map, 'resolve_col_prefix')
             wind_name_exec = read_conf(conf_map, 'wind_name_exec', None)
@@ -48,25 +90,7 @@ class MinSecTrans(BaseDataTrans):
             boolean_sec_to_min = int(boolean_sec_to_min) == 1
 
             cols_trans_all = dict()
-            trans_cols = ['wind_turbine_number', 'time_stamp', 'active_power', 'rotor_speed', 'generator_speed',
-                          'wind_velocity', 'pitch_angle_blade_1', 'pitch_angle_blade_2', 'pitch_angle_blade_3',
-                          'cabin_position', 'true_wind_direction', 'yaw_error1', 'set_value_of_active_power',
-                          'gearbox_oil_temperature', 'generatordrive_end_bearing_temperature',
-                          'generatornon_drive_end_bearing_temperature', 'wind_turbine_status', 'wind_turbine_status2',
-                          'cabin_temperature', 'twisted_cable_angle', 'front_back_vibration_of_the_cabin',
-                          'side_to_side_vibration_of_the_cabin', 'actual_torque', 'given_torque',
-                          'clockwise_yaw_count', 'counterclockwise_yaw_count', 'unusable', 'power_curve_available',
-                          'required_gearbox_speed',
-                          'inverter_speed_master_control', 'outside_cabin_temperature', 'main_bearing_temperature',
-                          'main_bearing_temperature_2', 'gearbox_high_speed_shaft_bearing_temperature',
-                          'gearboxmedium_speed_shaftbearing_temperature',
-                          'gearbox_low_speed_shaft_bearing_temperature', 'generator_winding1_temperature',
-                          'generator_winding2_temperature', 'generator_winding3_temperature',
-                          'turbulence_intensity', 'grid_a_phase_current', 'grid_b_phase_current',
-                          'grid_c_phase_current', 'reactive_power', 'param1', 'param2', 'param3', 'param4', 'param5',
-                          'param6', 'param7', 'param8', 'param9', 'param10']
-
-            for col in trans_cols:
+            for col in self.TRANS_COLS:
                 cols_trans_all[col] = read_conf(conf_map, col, '')
 
             return TransParam(read_type=self.transfer_type, read_path=self.read_dir,
@@ -77,13 +101,13 @@ class MinSecTrans(BaseDataTrans):
                               resolve_col_prefix=resolve_col_prefix, need_valid_cols=need_valid_cols,
                               boolean_sec_to_min=boolean_sec_to_min)
 
-    # 第三步 读取 并 保存到临时文件
     def read_and_save_tmp_file(self):
+        """第三步:读取并保存到临时文件"""
         read_and_save_tmp = ReadAndSaveTmp(self.pathsAndTable, self.trans_param)
         read_and_save_tmp.run()
 
-    # 第四步 统计 并 保存到正式文件
     def statistics_and_save_tmp_formal_file(self):
+        """第四步:统计并保存到正式文件"""
         # 保存到正式文件
         statistics_and_save_tmp_formal_file = StatisticsAndSaveTmpFormalFile(self.pathsAndTable, self.trans_param,
                                                                              self.statistics_map,
@@ -91,11 +115,12 @@ class MinSecTrans(BaseDataTrans):
         statistics_and_save_tmp_formal_file.run()
 
     def combine_and_save_formal_file(self):
+        """合并并保存正式文件"""
         combine_and_save_formal_file = CombineAndSaveFormalFile(self.pathsAndTable)
         self.update_files = combine_and_save_formal_file.run()
 
-    # 最后更新执行程度
     def update_exec_progress(self):
+        """最后更新执行进度"""
         all_files = set([os.path.basename(i) for i in self.update_files])
         update_trans_status_success(self.id, len(all_files),
                                     self.statistics_map['time_granularity'],

+ 111 - 45
etl/wind_power/min_sec/ReadAndSaveTmp.py

@@ -1,31 +1,47 @@
 import datetime
 import multiprocessing
+import os
 import traceback
-from os import *
 
 import pandas as pd
 
+from conf.constants import ParallelProcessing
 from etl.common.PathsAndTable import PathsAndTable
 from etl.wind_power.min_sec import TransParam
 from service.trans_conf_service import update_trans_transfer_progress
 from utils.file.trans_methods import read_excel_files, split_array, del_blank, \
     create_file_path, read_file_to_df, valid_eval
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, debug, error
 from utils.systeminfo.sysinfo import use_files_get_max_cpu_count, get_dir_size
 
 
 class ReadAndSaveTmp(object):
+    """读取并保存临时文件类"""
 
     def __init__(self, pathsAndTable: PathsAndTable, trans_param: TransParam):
+        """
+        初始化读取并保存临时文件类
+        
+        Args:
+            pathsAndTable: 路径和表对象
+            trans_param: 转换参数对象
+        """
         self.pathsAndTable = pathsAndTable
         self.trans_param = trans_param
         self.exist_wind_names = multiprocessing.Manager().list()
         self.lock = multiprocessing.Manager().Lock()
         self.file_lock = multiprocessing.Manager().dict()
 
-    def _save_to_tmp_csv_by_name(self, df, name):
+    def _save_to_tmp_csv_by_name(self, df: pd.DataFrame, name: str):
+        """
+        根据风机名称保存到临时CSV文件
+        
+        Args:
+            df: 数据帧
+            name: 风机名称
+        """
         save_name = str(name) + '.csv'
-        save_path = path.join(self.pathsAndTable.get_read_tmp_path(), save_name)
+        save_path = os.path.join(self.pathsAndTable.get_read_tmp_path(), save_name)
         create_file_path(save_path, is_file_path=True)
 
         with self.lock:
@@ -41,7 +57,13 @@ class ReadAndSaveTmp(object):
             else:
                 df.to_csv(save_path, index=False, encoding='utf8')
 
-    def save_merge_data(self, file_path):
+    def save_merge_data(self, file_path: str):
+        """
+        保存合并数据
+        
+        Args:
+            file_path: 文件路径
+        """
         df = self.read_excel_to_df(file_path)
         if self.trans_param.wind_name_exec:
             if valid_eval(self.trans_param.wind_name_exec):
@@ -67,7 +89,7 @@ class ReadAndSaveTmp(object):
                         else:
                             contains_name = False
                             self.exist_wind_names.append(exist_name)
-                        save_path = path.join(merge_path, csv_name)
+                        save_path = os.path.join(merge_path, csv_name)
                         now_df = df[df['wind_turbine_number'] == wind_name][['time_stamp', col]]
                         if contains_name:
                             now_df.to_csv(save_path, index=False, encoding='utf-8', mode='a',
@@ -75,7 +97,16 @@ class ReadAndSaveTmp(object):
                         else:
                             now_df.to_csv(save_path, index=False, encoding='utf-8')
 
-    def trans_df_cols(self, df):
+    def trans_df_cols(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        转换数据帧列名
+        
+        Args:
+            df: 数据帧
+        
+        Returns:
+            转换后的数据帧
+        """
         if self.trans_param.is_vertical_table:
             pass
         else:
@@ -120,8 +151,13 @@ class ReadAndSaveTmp(object):
 
         return df
 
-    def df_save_to_tmp_file(self, df=pd.DataFrame()):
-
+    def df_save_to_tmp_file(self, df: pd.DataFrame = pd.DataFrame()):
+        """
+        保存数据帧到临时文件
+        
+        Args:
+            df: 数据帧
+        """
         df = self.trans_df_cols(df)
 
         df = del_blank(df, ['wind_turbine_number'])
@@ -133,19 +169,34 @@ class ReadAndSaveTmp(object):
 
         self.save_to_tmp_csv(df)
 
-    def save_to_tmp_csv(self, df):
+    def save_to_tmp_csv(self, df: pd.DataFrame):
+        """
+        保存到临时CSV文件
+        
+        Args:
+            df: 数据帧
+        """
         names = set(df['wind_turbine_number'].values)
         if names:
-            trans_print("开始保存", str(names), "到临时文件", df.shape)
+            debug("开始保存", str(names), "到临时文件", df.shape)
 
             for name in names:
                 self._save_to_tmp_csv_by_name(df[df['wind_turbine_number'] == name], name)
             del df
-            trans_print("保存", str(names), "到临时文件成功, 风机数量", len(names))
-
-    def merge_df(self, dir_path):
+            debug("保存", str(names), "到临时文件成功, 风机数量", len(names))
+
+    def merge_df(self, dir_path: str) -> pd.DataFrame:
+        """
+        合并数据帧
+        
+        Args:
+            dir_path: 目录路径
+        
+        Returns:
+            合并后的数据帧
+        """
         all_files = read_excel_files(dir_path)
-        wind_turbine_number = path.basename(dir_path)
+        wind_turbine_number = os.path.basename(dir_path)
         df = pd.DataFrame()
         for file in all_files:
             now_df = read_file_to_df(file)
@@ -161,8 +212,13 @@ class ReadAndSaveTmp(object):
         return df
 
     def read_file_and_save_tmp(self):
+        """
+        读取文件并保存到临时文件
+        """
         all_files = read_excel_files(self.pathsAndTable.get_excel_tmp_path())
         split_count = use_files_get_max_cpu_count(all_files)
+        # 限制最大进程数
+        split_count = min(split_count, ParallelProcessing.MAX_PROCESSES)
         all_arrays = split_array(all_files, split_count)
 
         if self.trans_param.merge_columns:
@@ -172,7 +228,7 @@ class ReadAndSaveTmp(object):
                         pool.starmap(self.save_merge_data, [(ar,) for ar in arr])
 
                 except Exception as e:
-                    trans_print(traceback.format_exc())
+                    error(traceback.format_exc())
                     message = "整理临时文件,系统返回错误:" + str(e)
                     raise ValueError(message)
 
@@ -180,28 +236,28 @@ class ReadAndSaveTmp(object):
                                                round(20 + 20 * (index + 1) / len(all_arrays), 2),
                                                self.pathsAndTable.save_db)
 
-            dirs = [path.join(self.pathsAndTable.get_merge_tmp_path(), dir_name) for dir_name in
-                    listdir(self.pathsAndTable.get_merge_tmp_path())]
-            dir_total_size = get_dir_size(dirs[0])
-            # split_count = max_file_size_get_max_cpu_count(dir_total_size, memory_percent=1 / 12, cpu_percent=1 / 10)
-            split_count = 2
-            all_arrays = split_array(dirs, split_count)
-            for index, arr in enumerate(all_arrays):
-                try:
-                    with multiprocessing.Pool(split_count) as pool:
-                        pool.starmap(self.merge_df, [(ar,) for ar in arr])
-
-                except Exception as e:
-                    trans_print(traceback.format_exc())
-                    message = "整理临时文件,系统返回错误:" + str(e)
-                    raise ValueError(message)
-
-                update_trans_transfer_progress(self.pathsAndTable.id,
-                                               round(20 + 30 * (index + 1) / len(all_arrays), 2),
-                                               self.pathsAndTable.save_db)
+            dirs = [os.path.join(self.pathsAndTable.get_merge_tmp_path(), dir_name) for dir_name in
+                    os.listdir(self.pathsAndTable.get_merge_tmp_path())]
+            if dirs:
+                dir_total_size = get_dir_size(dirs[0])
+                # NOTE(review): dir_total_size is a size in BYTES, not a worker
+                # count — min(bytes, MAX_PROCESSES) is almost always MAX_PROCESSES.
+                # Cap the pool by the number of directories to process instead.
+                split_count = min(max(1, len(dirs)), ParallelProcessing.MAX_PROCESSES)
+                all_arrays = split_array(dirs, split_count)
+                for index, arr in enumerate(all_arrays):
+                    try:
+                        with multiprocessing.Pool(split_count) as pool:
+                            pool.starmap(self.merge_df, [(ar,) for ar in arr])
+
+                    except Exception as e:
+                        error(traceback.format_exc())
+                        message = "整理临时文件,系统返回错误:" + str(e)
+                        raise ValueError(message)
+
+                    update_trans_transfer_progress(self.pathsAndTable.id,
+                                                   round(20 + 30 * (index + 1) / len(all_arrays), 2),
+                                                   self.pathsAndTable.save_db)
 
         else:
-
             for index, arr in enumerate(all_arrays):
                 try:
                     with multiprocessing.Pool(split_count) as pool:
@@ -209,7 +265,7 @@ class ReadAndSaveTmp(object):
                     for df in dfs:
                         self.df_save_to_tmp_file(df)
                 except Exception as e:
-                    trans_print(traceback.format_exc())
+                    error(traceback.format_exc())
                     message = "整理临时文件,系统返回错误:" + str(e)
                     raise ValueError(message)
 
@@ -217,8 +273,16 @@ class ReadAndSaveTmp(object):
                                                round(20 + 30 * (index + 1) / len(all_arrays), 2),
                                                self.pathsAndTable.save_db)
 
-    def read_excel_to_df(self, file_path):
-
+    def read_excel_to_df(self, file_path: str) -> pd.DataFrame:
+        """
+        读取Excel文件到数据帧
+        
+        Args:
+            file_path: 文件路径
+        
+        Returns:
+            数据帧
+        """
         read_cols = [v.split(",")[0] for k, v in self.trans_param.cols_tran.items() if v and not v.startswith("$")]
 
         trans_dict = {}
@@ -300,7 +364,7 @@ class ReadAndSaveTmp(object):
 
             for k, v in trans_dict.items():
                 if k.startswith("$file"):
-                    file = ".".join(path.basename(file_path).split(".")[0:-1])
+                    file = ".".join(os.path.basename(file_path).split(".")[0:-1])
                     if k == "$file":
                         ks = k.split("|")
                         bool_contains = False
@@ -337,7 +401,7 @@ class ReadAndSaveTmp(object):
                     datas = str(k.split(",")[1].replace("$file_date", "").replace("[", "").replace("]", "")).split(":")
                     if len(datas) != 2:
                         raise Exception("字段映射出现错误 :" + str(trans_dict))
-                    file = ".".join(path.basename(file_path).split(".")[0:-1])
+                    file = ".".join(os.path.basename(file_path).split(".")[0:-1])
                     date_str = str(file[int(datas[0]):int(datas[1])]).strip()
                     df[v] = df[k.split(",")[0]].apply(lambda x: date_str + " " + str(x))
 
@@ -351,8 +415,8 @@ class ReadAndSaveTmp(object):
                     if not bool_contains:
                         cengshu = int(str(ks[0].replace("$folder", "").replace("[", "").replace("]", "")))
                         for i in range(cengshu):
-                            folder = path.dirname(folder)
-                        df[v] = str(str(folder).split(sep)[-1]).strip()
+                            folder = os.path.dirname(folder)
+                        df[v] = str(str(folder).split(os.sep)[-1]).strip()
                 elif k.startswith("$sheet_name"):
                     df[v] = df['sheet_name']
 
@@ -374,9 +438,11 @@ class ReadAndSaveTmp(object):
             return df
 
     def run(self):
-        trans_print("开始保存数据到临时文件")
+        """
+        """
+        info("开始保存数据到临时文件")
         begin = datetime.datetime.now()
         self.read_file_and_save_tmp()
         update_trans_transfer_progress(self.pathsAndTable.id, 50,
                                        self.pathsAndTable.save_db)
-        trans_print("保存数据到临时文件结束,耗时:", datetime.datetime.now() - begin)
+        info("保存数据到临时文件结束,耗时:", datetime.datetime.now() - begin)

+ 53 - 40
etl/wind_power/min_sec/StatisticsAndSaveTmpFormalFile.py

@@ -5,15 +5,16 @@ from os import path
 import numpy as np
 import pandas as pd
 
+from conf.constants import DataProcessing, ParallelProcessing, Types
 from etl.common.PathsAndTable import PathsAndTable
 from etl.wind_power.min_sec import TransParam
 from etl.wind_power.min_sec.ClassIdentifier import ClassIdentifier
 from etl.wind_power.min_sec.FilterValidData import FilterValidData
 from service.trans_conf_service import update_trans_transfer_progress
 from utils.conf.read_conf import read_conf
-from utils.df_utils.util import get_time_space
-from utils.file.trans_methods import create_file_path, read_excel_files, read_file_to_df, split_array
-from utils.log.trans_log import trans_print
+from utils.df_utils.util import estimate_time_interval as get_time_space
+from utils.file.trans_methods import create_file_path, read_excel_files, read_file_to_df
+from utils.log.trans_log import debug, error
 from utils.systeminfo.sysinfo import use_files_get_max_cpu_count
 
 exec("import math")
@@ -74,13 +75,12 @@ class StatisticsAndSaveTmpFormalFile(object):
             self.trans_param.wind_col_trans).fillna(df['wind_turbine_number'])
         wind_col_name = str(df['wind_turbine_number'].values[0])
 
-        not_double_cols = ['wind_turbine_number', 'wind_turbine_name', 'time_stamp', 'param6', 'param7', 'param8',
-                           'param9', 'param10']
+        not_double_cols = DataProcessing.NOT_DOUBLE_COLS
 
         # 删除 有功功率 和 风速均为空的情况
         df.dropna(subset=['active_power', 'wind_velocity'], how='any', inplace=True)
-        trans_print(origin_wind_name, wind_col_name, "删除有功功率和风速有空的情况后:", df.shape)
-        df.replace(np.nan, -999999999, inplace=True)
+        debug(origin_wind_name, wind_col_name, "删除有功功率和风速有空的情况后:", df.shape)
+        df.replace(np.nan, DataProcessing.NAN_REPLACE_VALUE, inplace=True)
         number_cols = df.select_dtypes(include=['number']).columns.tolist()
         for col in df.columns:
             if col not in not_double_cols and col not in number_cols:
@@ -88,8 +88,8 @@ class StatisticsAndSaveTmpFormalFile(object):
                     df[col] = pd.to_numeric(df[col], errors='coerce')
                     # 删除包含NaN的行(即那些列A转换失败的行)
                     df = df.dropna(subset=[col])
-                    trans_print(origin_wind_name, wind_col_name, "删除非数值列名:", col)
-        df.replace(-999999999, np.nan, inplace=True)
+                    debug(origin_wind_name, wind_col_name, "删除非数值列名:", col)
+        df.replace(DataProcessing.NAN_REPLACE_VALUE, np.nan, inplace=True)
 
         df.drop_duplicates(['wind_turbine_number', 'time_stamp'], keep='first', inplace=True)
 
@@ -102,40 +102,40 @@ class StatisticsAndSaveTmpFormalFile(object):
         # 删除每行有空值的行(2025-3-24)
         # origin_count = df.shape[0]
         # df = df.dropna()
-        # trans_print(f'原始数据量:{origin_count},去除na后数据量:{df.shape[0]}')
+        # trans_print(f"原始数据量:{origin_count},去除na后数据量:{df.shape[0]}")
 
         # 如果秒级有可能合并到分钟级
         # TODO add 秒转分钟
         if self.trans_param.boolean_sec_to_min:
             df['time_stamp'] = df['time_stamp'].apply(lambda x: x + pd.Timedelta(minutes=(10 - x.minute % 10) % 10))
-            df['time_stamp'] = df['time_stamp'].dt.floor('10T')
+            df['time_stamp'] = df['time_stamp'].dt.floor(DataProcessing.TIME_INTERVAL)
             df = df.groupby(['wind_turbine_number', 'time_stamp']).mean().reset_index()
-        trans_print('有功功率前10个', df.head(10)['active_power'].values)
+        debug('有功功率前10个', df.head(10)['active_power'].values)
         power_df = df[df['active_power'] > 0]
-        trans_print(origin_wind_name, wind_col_name, "功率大于0的数量:", power_df.shape)
+        debug(origin_wind_name, wind_col_name, "功率大于0的数量:", power_df.shape)
         power = power_df.sample(int(power_df.shape[0] / 100))['active_power'].median()
 
-        trans_print(origin_wind_name, wind_col_name, '有功功率,中位数', power)
-        if power > 100000:
+        debug(origin_wind_name, wind_col_name, '有功功率,中位数', power)
+        if power > DataProcessing.POWER_UNIT_THRESHOLD:
             df['active_power'] = df['active_power'] / 1000
-        ## 做数据检测前,羡强行处理有功功率
+        # 做数据检测前,先强行处理有功功率
         # df = df[df['active_power'] < 50000]
 
         rated_power_and_cutout_speed_tuple = read_conf(self.rated_power_and_cutout_speed_map, str(wind_col_name))
         if rated_power_and_cutout_speed_tuple is None:
-            rated_power_and_cutout_speed_tuple = (None, None)
-            trans_print(origin_wind_name, '未从平台匹配到额定功率')
+            # rated_power_and_cutout_speed_tuple = (None, None)
+            error(origin_wind_name, '未从平台匹配到额定功率')
         else:
-            trans_print(origin_wind_name, '过滤数据前数据大小', df.shape)
-            trans_print(origin_wind_name, '额定功率', rated_power_and_cutout_speed_tuple[0])
+            debug(origin_wind_name, '过滤数据前数据大小', df.shape)
+            debug(origin_wind_name, '额定功率', rated_power_and_cutout_speed_tuple[0])
             # trans_print(origin_wind_name, '\n', df.head(10))
             filter_valid_data = FilterValidData(df, rated_power_and_cutout_speed_tuple[0])
             try:
                 df = filter_valid_data.run()
             except:
-                trans_print(origin_wind_name, '过滤数据异常', filename)
+                error(origin_wind_name, '过滤数据异常', filename)
                 raise
-            trans_print(origin_wind_name, '过滤数据后数据大小', df.shape)
+            debug(origin_wind_name, '过滤数据后数据大小', df.shape)
 
             # 如果有需要处理的,先进行代码处理,在进行打标签
             # exec_code = get_trans_exec_code(self.paths_and_table.exec_id, self.paths_and_table.read_type)
@@ -147,10 +147,10 @@ class StatisticsAndSaveTmpFormalFile(object):
             if power_df.shape[0] == 0:
                 df.loc[:, 'lab'] = -1
             else:
-                class_identifiler = ClassIdentifier(wind_turbine_number=origin_wind_name, origin_df=df,
-                                                    rated_power=rated_power_and_cutout_speed_tuple[0],
-                                                    cut_out_speed=rated_power_and_cutout_speed_tuple[1])
-                df = class_identifiler.run()
+                class_identifier = ClassIdentifier(wind_turbine_number=origin_wind_name, origin_df=df,
+                                                   rated_power=rated_power_and_cutout_speed_tuple[0],
+                                                   cut_out_speed=rated_power_and_cutout_speed_tuple[1])
+                df = class_identifier.run()
 
             del power_df
 
@@ -163,7 +163,7 @@ class StatisticsAndSaveTmpFormalFile(object):
             df['year_month'] = df[['year', 'month']].apply(lambda x: str(x['year']) + str(x['month']).zfill(2), axis=1)
             cols = df.columns
 
-            if self.paths_and_table.read_type == 'second':
+            if self.paths_and_table.read_type == Types.SECOND:
                 type_col = 'year_month'
             else:
                 type_col = 'year'
@@ -185,29 +185,42 @@ class StatisticsAndSaveTmpFormalFile(object):
             self.set_statistics_data(df)
 
             del df
-            trans_print("保存" + str(wind_col_name) + "成功")
+            debug("保存" + str(wind_col_name) + "成功")
 
-    def mutiprocessing_to_save_file(self):
+    def multiprocessing_to_save_file(self):
         # 开始保存到正式文件
         all_tmp_files = read_excel_files(self.paths_and_table.get_read_tmp_path())
-        # split_count = self.pathsAndTable.multi_pool_count
-        split_count = use_files_get_max_cpu_count(all_tmp_files)
-        all_arrays = split_array(all_tmp_files, split_count)
+
+        if not all_tmp_files:
+            debug("没有临时文件需要处理")
+            return
+
+        # 计算最佳进程数
+        max_processes = use_files_get_max_cpu_count(all_tmp_files)
+        max_processes = min(max_processes, len(all_tmp_files), ParallelProcessing.MAX_PROCESSES)  # 限制最大进程数
 
         try:
-            for index, arr in enumerate(all_arrays):
-                with multiprocessing.Pool(split_count) as pool:
-                    pool.starmap(self.save_to_csv, [(i,) for i in arr])
-                update_trans_transfer_progress(self.paths_and_table.id,
-                                               round(50 + 15 * (index + 1) / len(all_arrays), 2),
-                                               self.paths_and_table.save_db)
+            # 创建一个进程池处理所有文件
+            with multiprocessing.Pool(max_processes) as pool:
+                # 分批次处理并更新进度
+                batch_size = max(1, len(all_tmp_files) // ParallelProcessing.MAX_BATCHES)  # 批次数上限由 MAX_BATCHES 控制
+
+                for i in range(0, len(all_tmp_files), batch_size):
+                    batch_files = all_tmp_files[i:i + batch_size]
+                    pool.starmap(self.save_to_csv, [(file,) for file in batch_files])
+
+                    # 更新进度
+                    progress = 50 + 15 * (i + len(batch_files)) / len(all_tmp_files)
+                    update_trans_transfer_progress(self.paths_and_table.id,
+                                                   round(progress, 2),
+                                                   self.paths_and_table.save_db)
 
         except Exception as e:
-            trans_print(traceback.format_exc())
+            error(traceback.format_exc())
             message = "保存文件错误,系统返回错误:" + str(e)
             raise ValueError(message)
 
     def run(self):
-        self.mutiprocessing_to_save_file()
+        self.multiprocessing_to_save_file()
         update_trans_transfer_progress(self.paths_and_table.id, 65,
                                        self.paths_and_table.save_db)

+ 42 - 7
etl/wind_power/min_sec/TransParam.py

@@ -1,23 +1,58 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/5/16
 # @Author  : 魏志亮
+from typing import Optional, Dict, List
 
 
 class TransParam(object):
+    """转换参数类
+    
+    存储数据转换过程中的各种参数配置
+    """
 
-    def __init__(self, read_type=None, read_path=None, cols_tran=dict(),
-                 wind_name_exec=str(), is_vertical_table=False, vertical_cols=list(), vertical_key=None,
-                 vertical_value=None, index_cols=list(), merge_columns=False, resolve_col_prefix=None,
-                 need_valid_cols=True, wind_col_trans: dict = None, boolean_sec_to_min=False):
+    def __init__(self, 
+                 read_type: Optional[str] = None, 
+                 read_path: Optional[str] = None, 
+                 cols_tran: Dict[str, str] = None,
+                 wind_name_exec: str = "", 
+                 is_vertical_table: bool = False, 
+                 vertical_cols: List[str] = None,
+                 vertical_key: Optional[str] = None,
+                 vertical_value: Optional[str] = None, 
+                 index_cols: List[str] = None, 
+                 merge_columns: bool = False, 
+                 resolve_col_prefix: Optional[str] = None,
+                 need_valid_cols: bool = True, 
+                 wind_col_trans: Optional[Dict[str, str]] = None, 
+                 boolean_sec_to_min: bool = False):
+        """
+        初始化转换参数
+        
+        Args:
+            read_type: 读取类型,如 'second' 或 'minute'
+            read_path: 读取路径
+            cols_tran: 列名转换映射
+            wind_name_exec: 风机名称处理表达式
+            is_vertical_table: 是否为垂直表
+            vertical_cols: 垂直表列名列表
+            vertical_key: 垂直表键列
+            vertical_value: 垂直表值列
+            index_cols: 索引列列表
+            merge_columns: 是否合并列
+            resolve_col_prefix: 列名前缀解析表达式
+            need_valid_cols: 是否需要验证列
+            wind_col_trans: 风机列转换映射
+            boolean_sec_to_min: 是否将秒级数据转换为分钟级
+        """
         self.read_type = read_type
         self.read_path = read_path
-        self.cols_tran = cols_tran
+        self.cols_tran = cols_tran or {}
         self.is_vertical_table = is_vertical_table
         self.wind_name_exec = wind_name_exec
-        self.vertical_cols = vertical_cols
+        self.vertical_cols = vertical_cols or []
         self.vertical_key = vertical_key
         self.vertical_value = vertical_value
-        self.index_cols = index_cols
+        self.index_cols = index_cols or []
         self.merge_columns = merge_columns
         self.resolve_col_prefix = resolve_col_prefix
         self.need_valid_cols = need_valid_cols

+ 53 - 18
etl/wind_power/wave/WaveTrans.py

@@ -1,14 +1,16 @@
 import json
 import multiprocessing
 import traceback
+from typing import Tuple
 
+from conf.constants import ParallelProcessing, Types
 from service.plt_service import get_all_wind
 from service.trans_conf_service import update_trans_status_running, update_trans_transfer_progress, \
     update_trans_status_success, update_trans_status_error
 from service.trans_service import get_wave_conf, save_df_to_db, get_or_create_wave_table, \
     get_wave_data, delete_exist_wave_data
 from utils.file.trans_methods import *
-from utils.log.trans_log import set_trance_id
+from utils.log.trans_log import set_trance_id, info, error
 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
 
 exec("from os.path import *")
@@ -16,8 +18,17 @@ exec("import re")
 
 
 class WaveTrans(object):
-
-    def __init__(self, id, wind_farm_code, read_dir):
+    """波形数据转换类"""
+
+    def __init__(self, id: int, wind_farm_code: str, read_dir: str):
+        """
+        初始化波形数据转换类
+        
+        Args:
+            id: 任务ID
+            wind_farm_code: 风电场编码
+            read_dir: 读取目录
+        """
         self.id = id
         self.wind_farm_code = wind_farm_code
         self.read_dir = read_dir
@@ -28,11 +39,28 @@ class WaveTrans(object):
         self.max_date = None
         self.data_count = 0
 
-    def get_data_exec(self, func_code, filepath, measupoint_names: set):
+    def get_data_exec(self, func_code: str, filepath: str, measupoint_names: List[str]) -> Optional[Tuple]:
+        """
+        执行数据获取函数
+        
+        Args:
+            func_code: 函数代码
+            filepath: 文件路径
+            measupoint_names: 测量点名称列表
+        
+        Returns:
+            数据元组
+        """
         exec(func_code)
         return locals()['get_data'](filepath, measupoint_names)
 
-    def del_exists_data(self, df):
+    def del_exists_data(self, df: pd.DataFrame):
+        """
+        删除已存在的数据
+        
+        Args:
+            df: 数据帧
+        """
         min_date, max_date = df['time_stamp'].min(), df['time_stamp'].max()
         db_df = get_wave_data(self.wind_farm_code + '_wave', min_date, max_date)
 
@@ -44,13 +72,17 @@ class WaveTrans(object):
             delete_exist_wave_data(self.wind_farm_code + "_wave", ids)
 
     def run(self):
+        """运行波形数据转换"""
         update_trans_status_running(self.id)
         trance_id = '-'.join([self.wind_farm_code, 'wave'])
         set_trance_id(trance_id)
         all_files = read_files(self.read_dir, ['txt', 'csv'])
         update_trans_transfer_progress(self.id, 5)
+
         # 最大取系统cpu的 1/2
         split_count = get_available_cpu_count_with_percent(1 / 2)
+        # 限制最大进程数
+        split_count = min(split_count, ParallelProcessing.MAX_PROCESSES)
 
         all_wind, _ = get_all_wind(self.wind_farm_code, False)
 
@@ -58,11 +90,11 @@ class WaveTrans(object):
 
         wave_conf = get_wave_conf(self.wind_farm_code)
 
-        base_param_exec = wave_conf['base_param_exec']
+        base_param_exec = wave_conf.get('base_param_exec', '')
         map_dict = {}
         if base_param_exec:
             base_param_exec = base_param_exec.replace('\r\n', '\n').replace('\t', '    ')
-            trans_print(base_param_exec)
+            info(base_param_exec)
             if 'import ' in base_param_exec:
                 raise Exception("方法不支持import方法")
 
@@ -72,23 +104,26 @@ class WaveTrans(object):
 
         wind_turbine_name_set = set()
 
-        all_array = split_array(all_files, split_count * 10)
+        # 按进程数的10倍切分文件,减少进度更新与池创建的开销
+        batch_size = split_count * 10
+        all_array = split_array(all_files, batch_size)
         total_index = len(all_array)
+
         for index, now_array in enumerate(all_array):
             index_begin = datetime.datetime.now()
             with multiprocessing.Pool(split_count) as pool:
                 try:
                     file_datas = pool.starmap(self.get_data_exec,
                                               [(base_param_exec, i, list(map_dict.keys())) for i in now_array])
-                    trans_print(f'总数:{len(now_array)},返回个数{len(file_datas)}')
+                    info(f'总数:{len(now_array)},返回个数{len(file_datas)}')
                 except Exception as e:
                     message = str(e)
-                    trans_print(traceback.format_exc())
+                    error(traceback.format_exc())
                     update_trans_status_error(self.id, message[0:len(message) if len(message) < 100 else 100])
                     raise e
 
             update_trans_transfer_progress(self.id, 20 + int(index / total_index * 60))
-            trans_print("读取文件耗时:", datetime.datetime.now() - self.begin)
+            info("读取文件耗时:", datetime.datetime.now() - self.begin)
 
             result_list = list()
             for file_data in file_datas:
@@ -96,7 +131,7 @@ class WaveTrans(object):
                     wind_turbine_name, time_stamp, sampling_frequency, rotational_speed, mesure_point_name, type, mesure_data = \
                         file_data[0], file_data[1], file_data[2], file_data[3], file_data[4], file_data[5], file_data[6]
 
-                    if mesure_point_name in map_dict.keys():
+                    if mesure_point_name in map_dict:
                         wind_turbine_name_set.add(wind_turbine_name)
                         if self.min_date is None or self.min_date > time_stamp:
                             self.min_date = time_stamp
@@ -109,7 +144,7 @@ class WaveTrans(object):
                              mesure_data])
 
             if result_list:
-                self.data_count = self.data_count + len(result_list)
+                self.data_count += len(result_list)
                 df = pd.DataFrame(result_list,
                                   columns=['wind_turbine_name', 'time_stamp', 'rotational_speed', 'sampling_frequency',
                                            'mesure_point_name', 'type', 'mesure_data'])
@@ -118,16 +153,16 @@ class WaveTrans(object):
                 df.dropna(subset=['mesure_point_name'], inplace=True)
                 df['wind_turbine_number'] = df['wind_turbine_name'].map(all_wind).fillna(df['wind_turbine_name'])
 
+                # 将测量数据逐行序列化为JSON字符串,便于入库
                 df['mesure_data'] = df['mesure_data'].apply(lambda x: json.dumps(x))
 
                 df.sort_values(by=['time_stamp', 'mesure_point_name'], inplace=True)
                 # self.del_exists_data(df)
                 save_df_to_db(self.wind_farm_code + '_wave', df, batch_count=400)
-            trans_print(f"总共{total_index}组,当前{index + 1}", "本次写入耗时:", datetime.datetime.now() - index_begin,
-                        "总耗时:", datetime.datetime.now() - self.begin)
+            info(f"总共{total_index}组,当前{index + 1}", "本次写入耗时:", datetime.datetime.now() - index_begin,
+                 "总耗时:", datetime.datetime.now() - self.begin)
 
-        update_trans_status_success(self.id, len(wind_turbine_name_set), None,
+        update_trans_status_success(self.id, len(wind_turbine_name_set), Types.WAVE,
                                     self.min_date, self.max_date, self.data_count)
 
-        # update_trans_status_success(self.id)
-        trans_print("总耗时:", datetime.datetime.now() - self.begin)
+        info("总耗时:", datetime.datetime.now() - self.begin)

+ 3 - 3
service/common_connect.py

@@ -1,5 +1,5 @@
-from utils.db.ConnectMysql import ConnectMysql
+from utils.db.ConnectMysql import MySQLDatabase
 
-plt = ConnectMysql("plt")
+plt = MySQLDatabase("plt")
 
-trans = ConnectMysql("trans")
+trans = MySQLDatabase("trans")

+ 6 - 2
service/trans_conf_service.py

@@ -4,6 +4,7 @@
 from datetime import datetime
 
 from service.common_connect import trans
+from utils.log.trans_log import info
 
 
 def update_timeout_trans_data():
@@ -46,6 +47,7 @@ def update_trans_status_error(id, message="", save_db=True):
 
         message = message if len(message) <= 200 else message[0:200]
         trans.execute(exec_sql, (message, id))
+    info("执行失败:", message)
 
 
 def update_trans_status_success(id, wind_count=0, time_granularity=0,
@@ -70,14 +72,16 @@ def update_trans_status_success(id, wind_count=0, time_granularity=0,
             trans.execute(exec_sql, (wind_count, time_granularity, id))
 
 
-def update_trans_transfer_progress(id,  transfer_progress=0, save_db=True):
-    print(id,  transfer_progress)
+def update_trans_transfer_progress(id, transfer_progress=0, save_db=True):
+    print(id, transfer_progress)
     if save_db:
         exec_sql = """
         update data_transfer set transfer_progress =%s where id = %s 
         """
         trans.execute(exec_sql, (int(transfer_progress), id))
 
+    info('当前进度:', transfer_progress)
+
 
 def get_now_running_count():
     query_running_sql = """

+ 95 - 72
service/trans_service.py

@@ -9,53 +9,65 @@ import pandas as pd
 from service.common_connect import trans
 from service.trans_conf_service import create_wave_table
 from utils.file.trans_methods import split_array
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import info, error
 
 
-def get_min_sec_conf(field_code, trans_type) -> dict:
-    query_sql = "SELECT * FROM trans_conf where wind_code = %s and type = %s and status = 1"
-    res = trans.execute(query_sql, (field_code, trans_type))
+def get_config(table_name, field_code, trans_type=None, field_name='wind_code', status=1) -> dict:
+    """
+    通用配置获取函数
+    
+    Args:
+        table_name: 表名
+        field_code: 字段值
+        trans_type: 类型参数
+        field_name: 字段名,默认为wind_code
+        status: 状态值,默认为1
+        
+    Returns:
+        配置字典
+    """
+    if table_name == 'warn_fault_conf':
+        types = list()
+        if trans_type == 'fault':
+            types.append(1)
+        elif trans_type == 'warn':
+            types.append(2)
+        else:
+            error(f"未找到{trans_type}告警/故障的配置")
+            raise ValueError(f"未找到{trans_type}告警/故障的配置")
+        types.append(3)
+        query_sql = f"SELECT * FROM {table_name} where {field_name} = %s and type in %s and status = %s"
+        params = (field_code, types, status)
+    elif table_name == 'trans_conf' and field_name == 'wind_name':
+        query_sql = f"SELECT * FROM {table_name} where {field_name} = %s and type = %s and status = %s"
+        params = (field_code, trans_type, status)
+    elif table_name == 'trans_conf':
+        query_sql = f"SELECT * FROM {table_name} where {field_name} = %s and type = %s and status = %s"
+        params = (field_code, trans_type, status)
+    else:
+        query_sql = f"SELECT * FROM {table_name} where {field_name} = %s and status = %s"
+        params = (field_code, status)
+
+    res = trans.execute(query_sql, params)
     if type(res) == tuple or type(res) == str:
         return None
     return res[0]
 
 
-def get_min_sec_conf_test(field_code, trans_type) -> dict:
-    query_sql = "SELECT * FROM trans_conf where wind_name = %s and type = %s and status = 1"
-    res = trans.execute(query_sql, (field_code, trans_type))
-    print(res)
-    if type(res) == tuple or type(res) == str:
-        return None
-    return res[0]
+def get_min_sec_conf(field_code, trans_type) -> dict:
+    return get_config('trans_conf', field_code, trans_type)
 
 
-def get_fault_warn_conf(field_code, trans_type) -> dict:
-    types = list()
-    if trans_type == 'fault':
-        types.append(1)
-    elif trans_type == 'warn':
-        types.append(2)
-    else:
-        trans_print(f"未找到{trans_type}告警/故障的配置")
-        raise ValueError(f"未找到{trans_type}告警/故障的配置")
+def get_min_sec_conf_test(field_code, trans_type) -> dict:
+    return get_config('trans_conf', field_code, trans_type, field_name='wind_name')
 
-    types.append(3)
 
-    query_sql = "SELECT * FROM warn_fault_conf where wind_code = %s and type in %s and status = 1"
-    res = trans.execute(query_sql, (field_code, types))
-    print(res)
-    if type(res) == tuple or type(res) == str:
-        return None
-    return res[0]
+def get_fault_warn_conf(field_code, trans_type) -> dict:
+    return get_config('warn_fault_conf', field_code, trans_type)
 
 
 def get_wave_conf(field_code) -> dict:
-    query_sql = "SELECT * FROM wave_conf where wind_code = %s and status = 1"
-    res = trans.execute(query_sql, (field_code))
-    print(res)
-    if type(res) == tuple or type(res) == str:
-        return None
-    return res[0]
+    return get_config('wave_conf', field_code)
 
 
 def creat_min_sec_table(table_name, trans_type, wind_farm_name='', use_tidb=False):
@@ -64,7 +76,7 @@ def creat_min_sec_table(table_name, trans_type, wind_farm_name='', use_tidb=Fals
     """
     count = trans.execute(exists_table_sql)[0]['count']
     if count > 0:
-        trans_print(f"{table_name}已存在")
+        info(f"{table_name}已存在")
 
     if trans_type == 'second':
         add_key = 'KEY `year_month` (`year_month`)'
@@ -197,52 +209,63 @@ def drop_exists_data(table_name, wind_turbine_number, min_date, max_date):
     """
 
     count = trans.execute(sql)
-    trans_print(f"删除数据{count}条,{table_name},{wind_turbine_number},{min_date},{max_date}")
-
+    info(f"删除数据{count}条,{table_name},{wind_turbine_number},{min_date},{max_date}")
 
-def save_scada_file_to_db(table_name, file: str, wind_turbine_number, date_str, batch_count=100000, use_tidb=False):
-    base_name = path.basename(file)
-    df = pd.read_csv(file)
-    # if use_tidb:
-    #     min_date = df['time_stamp'].min()
-    #     max_date = df['time_stamp'].max()
-    #     # drop_exists_data(table_name, wind_turbine_number, min_date, max_date)
-    # else:
-    #     add_or_remove_partation(table_name, date_str, wind_turbine_number)
-
-    add_or_remove_partation(table_name, date_str, wind_turbine_number)
 
+def save_data_to_db(table_name: str, data, batch_count=100000, wind_turbine_number=None, date_str=None, file_name=None):
+    """
+    通用数据保存函数
+    
+    Args:
+        table_name: 表名
+        data: 数据,可以是DataFrame或文件路径
+        batch_count: 批处理大小
+        wind_turbine_number: 风机编号
+        date_str: 日期字符串
+        file_name: 文件名
+        
+    Returns:
+        None
+    """
     try:
-        trans_print(f"保存{table_name},{base_name},{wind_turbine_number},数据:{df.shape[0]}")
-        trans.execute_df_save(df, table_name, batch_count)
-        trans_print(f"保存到{table_name},{base_name},{wind_turbine_number} 成功,总条数:{df.shape[0]}")
+        # 处理数据
+        if isinstance(data, str):
+            # 从文件读取数据
+            df = pd.read_csv(data)
+            file_name = file_name or path.basename(data)
+        else:
+            # 直接使用DataFrame
+            df = data
+
+        # 处理分区
+        if wind_turbine_number and date_str:
+            add_or_remove_partation(table_name, date_str, wind_turbine_number)
+
+        # 保存数据
+        if wind_turbine_number:
+            trans.execute_df_save(df, table_name, batch_count)
+            info(f"保存到{table_name},{file_name},{wind_turbine_number} 成功,总条数:{df.shape[0]}")
+        else:
+            trans.execute_df_save(df, table_name, batch_count)
+            info(f"保存到{table_name}成功,总条数:{df.shape[0]}")
     except Exception as e:
-        trans_print(traceback.format_exc())
-        message = base_name + str(e)
+        if file_name:
+            message = file_name + str(e)
+        else:
+            message = str(e)
         raise Exception(message)
 
 
+def save_scada_file_to_db(table_name, file: str, wind_turbine_number, date_str, batch_count=100000, use_tidb=False):
+    save_data_to_db(table_name, file, batch_count, wind_turbine_number, date_str)
+
+
 def save_file_to_db(table_name: str, file: str, batch_count=100000):
-    base_name = path.basename(file)
-    try:
-        df = pd.read_csv(file)
-        trans_print(f"保存{table_name},总条数:{df.shape[0]}")
-        trans.execute_df_save(df, table_name, batch_count)
-        trans_print(f"保存到{table_name}成功,总条数:{df.shape[0]}")
-    except Exception as e:
-        trans_print(traceback.format_exc())
-        message = base_name + str(e)
-        raise Exception(message)
+    save_data_to_db(table_name, file, batch_count)
 
 
-def save_df_to_db(table_name: str, df: pd.DataFrame(), batch_count=100000):
-    try:
-        trans_print(f"保存{table_name},总条数:{df.shape[0]}")
-        trans.execute_df_save(df, table_name, batch_count)
-        trans_print(f"保存到{table_name}成功,总条数:{df.shape[0]}")
-    except Exception as e:
-        trans_print(traceback.format_exc())
-        raise Exception(str(e))
+def save_df_to_db(table_name: str, df: pd.DataFrame, batch_count=100000):
+    save_data_to_db(table_name, df, batch_count)
 
 
 def batch_statistics(table_name):
@@ -251,7 +274,7 @@ def batch_statistics(table_name):
         res = trans.execute(query_sql)
         return res[0]
     except:
-        trans_print(traceback.format_exc())
+        error(traceback.format_exc())
         return None
 
 
@@ -319,7 +342,7 @@ def get_trans_exec_code(id, query_type):
     if type(res) == tuple or type(res) == str:
         return None
     exec_code = res[0]['exec_code']
-    trans_print("任务ID", id, '类型', type, '获取到执行代码:', exec_code)
+    info("任务ID", id, '类型', type, '获取到执行代码:', exec_code)
     return exec_code
 
 

+ 4 - 2
utils/common.py

@@ -1,3 +1,5 @@
-excel_types = ['xls', 'xlsx', 'xlsm', 'xlsb', 'odf', 'ods', 'csv', 'csv.gz']
+from conf.constants import FileTypes
 
-zip_types = ['rar', 'zip']
+excel_types = FileTypes.EXCEL_TYPES
+
+zip_types = FileTypes.ZIP_TYPES

+ 137 - 12
utils/conf/read_conf.py

@@ -1,22 +1,147 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/6/7
 # @Author  : 魏志亮
+import os
 
 import yaml
+from typing import Any, Optional, Dict
 
 
-def yaml_conf(path, encoding='utf-8'):
-    with open(path, 'r', encoding=encoding) as f:
-        data = yaml.safe_load(f)
-    return data
+def load_yaml_config(file_path: str, encoding: str = 'utf-8') -> Dict[str, Any]:
+    """
+    加载YAML配置文件
+    
+    Args:
+        file_path: YAML文件路径
+        encoding: 文件编码,默认为utf-8
+        
+    Returns:
+        解析后的配置字典
+        
+    Raises:
+        FileNotFoundError: 文件不存在时抛出
+        yaml.YAMLError: YAML解析错误时抛出
+    """
+    try:
+        with open(file_path, 'r', encoding=encoding) as f:
+            data = yaml.safe_load(f)
+            # 确保返回字典类型,防止YAML文件为空时返回None
+            return data if isinstance(data, dict) else {}
+    except FileNotFoundError:
+        raise FileNotFoundError(f"配置文件不存在: {file_path}")
+    except yaml.YAMLError as e:
+        raise yaml.YAMLError(f"YAML解析错误: {e}")
 
 
-def read_conf(dict_conf, col, default_value=None):
-    if col in dict_conf:
-        res = dict_conf[col]
-        if res is None and default_value is not None:
-            return default_value
-        return res
-    else:
-        return default_value
+def get_config_value(config: Dict[str, Any], key: str, default: Optional[Any] = None) -> Any:
+    """
+    从配置字典中安全地获取值
+    
+    Args:
+        config: 配置字典
+        key: 配置键名
+        default: 默认值,当键不存在或值为None时返回
+        
+    Returns:
+        配置值或默认值
+    """
+    # 处理config为None的情况
+    if config is None:
+        return default
+    
+    # 支持嵌套键,如 "database.host"
+    keys = key.split('.')
+    value = config
+    
+    for k in keys:
+        if isinstance(value, dict) and k in value:
+            value = value[k]
+        else:
+            value = None
+            break
+    
+    # 如果值为None且提供了默认值,返回默认值
+    if value is None and default is not None:
+        return default
+    
+    return value
 
+
+def merge_configs(base_config: Dict[str, Any], override_config: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    合并配置字典
+    
+    Args:
+        base_config: 基础配置
+        override_config: 覆盖配置
+        
+    Returns:
+        合并后的配置
+    """
+    result = base_config.copy()
+    
+    for key, value in override_config.items():
+        if key in result and isinstance(result[key], dict) and isinstance(value, dict):
+            # 递归合并嵌套字典
+            result[key] = merge_configs(result[key], value)
+        else:
+            # 直接覆盖
+            result[key] = value
+    
+    return result
+
+
+def load_config_with_env(file_path: str, encoding: str = 'utf-8') -> Dict[str, Any]:
+    """
+    加载配置文件并支持环境变量覆盖
+    
+    Args:
+        file_path: YAML文件路径
+        encoding: 文件编码,默认为utf-8
+        
+    Returns:
+        解析后的配置字典
+    """
+    # 加载基础配置
+    base_config = load_yaml_config(file_path, encoding)
+    
+    # 检查是否有环境变量覆盖
+    env_prefix = "ETL_"
+    override_config = {}
+    
+    for key, value in os.environ.items():
+        if key.startswith(env_prefix):
+            # 转换环境变量名到配置键名
+            config_key = key[len(env_prefix):].lower().replace('_', '.')
+            
+            # 解析值
+            if value.lower() == 'true':
+                parsed_value = True
+            elif value.lower() == 'false':
+                parsed_value = False
+            elif value.isdigit():
+                parsed_value = int(value)
+            elif '.' in value and all(part.isdigit() for part in value.split('.')):
+                parsed_value = float(value)
+            else:
+                parsed_value = value
+            
+            # 构建嵌套配置
+            keys = config_key.split('.')
+            current = override_config
+            for k in keys[:-1]:
+                if k not in current:
+                    current[k] = {}
+                current = current[k]
+            current[keys[-1]] = parsed_value
+    
+    # 合并配置
+    if override_config:
+        base_config = merge_configs(base_config, override_config)
+    
+    return base_config
+
+
+# 为了保持向后兼容,保留原函数名(可选)
+yaml_conf = load_yaml_config
+read_conf = get_config_value

+ 231 - 41
utils/db/ConnectMysql.py

@@ -1,56 +1,246 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/6/7
+# @Author  : 魏志亮
+
+import os
 import traceback
-from os import *
+from typing import Any, Dict, List, Tuple, Union
 
 import pandas as pd
 import pymysql
 from pymysql.cursors import DictCursor
 from sqlalchemy import create_engine
+from sqlalchemy.engine import Engine
+
+from utils.conf.read_conf import load_yaml_config
+from utils.log.trans_log import error, info, debug
+
+
+class MySQLDatabase:
+    """MySQL数据库连接管理类"""
+
+    # 类级别的引擎缓存,避免重复创建
+    _engine_cache = {}
+
+    def __init__(self, connection_name: str):
+        """
+        初始化MySQL数据库连接
+        
+        Args:
+            connection_name: 配置文件中对应的连接名称
+        """
+        # 获取配置文件路径
+        config_path = os.environ.get('ETL_CONF')
+        if not config_path:
+            raise ValueError("环境变量 ETL_CONF 未设置")
+
+        # 加载配置
+        self.yaml_data = load_yaml_config(config_path)
+        self.connection_name = connection_name
+
+        # 验证配置是否存在
+        if connection_name not in self.yaml_data:
+            raise KeyError(f"配置中不存在连接名称: {connection_name}")
+
+        self.config = self.yaml_data[connection_name]
+        self.database = self.config.get('database', '')
+
+        # 验证必要配置项
+        required_keys = ['host', 'user', 'password', 'database']
+        missing_keys = [key for key in required_keys if key not in self.config]
+        if missing_keys:
+            raise KeyError(f"连接配置缺少必要项: {missing_keys}")
+
+    def get_connection(self) -> pymysql.Connection:
+        """
+        从连接池中获取一个连接
+        
+        Returns:
+            pymysql连接对象
+        """
+        # 创建连接配置副本,避免修改原配置
+        conn_config = self.config.copy()
+        # 移除可能不需要的配置项(如果有)
+        conn_config.pop('charset', None)  # pymysql连接时charset参数可能会冲突
+
+        return pymysql.connect(
+            cursorclass=DictCursor,
+            charset='utf8mb4',
+            **conn_config
+        )
+
+    def execute_query(self, sql: str, params: Union[Tuple, List, Dict] = None) -> List[Dict[str, Any]]:
+        """
+        执行SQL查询并返回结果
+        
+        Args:
+            sql: SQL语句
+            params: SQL参数,可以是元组、列表或字典
+            
+        Returns:
+            查询结果列表,每个元素为字典形式
+            
+        Raises:
+            Exception: SQL执行错误时抛出
+        """
+        params = params or ()
+        conn = None
+        cursor = None
+
+        try:
+            conn = self.get_connection()
+            cursor = conn.cursor()
+
+            # 执行SQL
+            cursor.execute(sql, params)
+            debug("开始执行SQL:\n", cursor._executed)
 
-from utils.conf.read_conf import yaml_conf
-from utils.log.trans_log import trans_print
+            # 提交事务
+            conn.commit()
 
+            # 获取结果
+            result = cursor.fetchall()
+            return result
 
-class ConnectMysql:
+        except Exception as e:
+            error(f"执行SQL出错: {sql}")
+            error(f"错误信息: {e}")
+            error(traceback.format_exc())
 
-    def __init__(self, connet_name):
-        self.yaml_data = yaml_conf(environ.get('ETL_CONF'))
-        self.connet_name = connet_name
-        self.config = self.yaml_data[self.connet_name]
-        self.database = self.config['database']
+            if conn:
+                conn.rollback()
+            raise e
 
-    # 从连接池中获取一个连接
-    def get_conn(self):
-        return pymysql.connect(**self.config)
+        finally:
+            # 确保资源被释放
+            if cursor:
+                cursor.close()
+            if conn:
+                conn.close()
 
-    # 使用连接执行sql
-    def execute(self, sql, params=tuple()):
+    def execute_update(self, sql: str, params: Union[Tuple, List, Dict] = None) -> int:
+        """
+        执行更新操作(INSERT, UPDATE, DELETE)
+        
+        Args:
+            sql: SQL语句
+            params: SQL参数
+            
+        Returns:
+            影响的行数
+        """
+        params = params or ()
+        conn = None
+        cursor = None
 
-        with self.get_conn() as conn:
-            with conn.cursor(cursor=DictCursor) as cursor:
-                try:
-                    cursor.execute(sql, params)
-                    trans_print("开始执行SQL:", cursor._executed)
-                    conn.commit()
-                    result = cursor.fetchall()
-                    return result
-                except Exception as e:
-                    trans_print(f"执行sql:{sql},报错:{e}")
-                    trans_print(traceback.format_exc())
-                    conn.rollback()
-                    raise e
+        try:
+            conn = self.get_connection()
+            cursor = conn.cursor()
 
-    def get_engine(self):
+            cursor.execute(sql, params)
+            debug("开始执行SQL:", cursor._executed)
+
+            conn.commit()
+            return cursor.rowcount
+
+        except Exception as e:
+            error(f"执行更新SQL出错: {sql}")
+            error(f"错误信息: {e}")
+            error(traceback.format_exc())
+
+            if conn:
+                conn.rollback()
+            raise e
+
+        finally:
+            if cursor:
+                cursor.close()
+            if conn:
+                conn.close()
+
+    def get_engine(self) -> Engine:
+        """
+        获取SQLAlchemy引擎,使用缓存避免重复创建
+        
+        Returns:
+            SQLAlchemy引擎对象
+        """
+        # 构建缓存键
         config = self.config
-        username = config['user']
-        password = config['password']
-        host = config['host']
-        port = config['port']
-        dbname = config['database']
-        return create_engine(f'mysql+pymysql://{username}:{password}@{host}:{port}/{dbname}')
-
-    def execute_df_save(self, df, table_name, chunk_size=10000):
-        df.to_sql(table_name, self.get_engine(), index=False, if_exists='append', chunksize=chunk_size)
-
-    def read_sql_to_df(self, sql):
-        df = pd.read_sql_query(sql, self.get_engine())
-        return df
+        cache_key = f"{config['host']}:{config['port']}:{config['user']}:{config['database']}"
+
+        # 检查缓存中是否已有引擎
+        if cache_key not in self._engine_cache:
+            username = config['user']
+            password = config['password']
+            host = config['host']
+            port = config['port']
+            dbname = config['database']
+
+            # 构建连接URL
+            connection_url = f'mysql+pymysql://{username}:{password}@{host}:{port}/{dbname}?charset=utf8mb4'
+
+            # 创建引擎并缓存
+            self._engine_cache[cache_key] = create_engine(
+                connection_url,
+                pool_size=10,  # 增加连接池大小
+                pool_recycle=3600,
+                pool_pre_ping=True,  # 连接池预ping,确保连接有效
+                echo=False  # 设置为True可打印SQL日志
+            )
+
+        return self._engine_cache[cache_key]
+
+    def save_dataframe(self, df: pd.DataFrame, table_name: str, chunk_size: int = 10000,
+                       if_exists: str = 'append') -> None:
+        """
+        将DataFrame保存到数据库表
+        
+        Args:
+            df: pandas DataFrame对象
+            table_name: 目标表名
+            chunk_size: 每批写入的行数
+            if_exists: 表存在时的处理方式:'fail', 'replace', 'append'
+        """
+        try:
+            df.to_sql(
+                table_name,
+                self.get_engine(),
+                index=False,
+                if_exists=if_exists,
+                chunksize=chunk_size,
+                method='multi'  # 使用多值插入提高性能
+            )
+            info(f"成功保存 {len(df)} 条数据到表 {table_name}")
+
+        except Exception as e:
+            error(f"保存DataFrame到表 {table_name} 失败: {e}")
+            error(traceback.format_exc())
+            raise e
+
+    def read_sql_to_dataframe(self, sql: str) -> pd.DataFrame:
+        """
+        执行SQL查询并返回DataFrame
+        
+        Args:
+            sql: SQL查询语句
+            
+        Returns:
+            查询结果的DataFrame
+        """
+        try:
+            df = pd.read_sql_query(sql, self.get_engine())
+            debug(f"查询返回 {len(df)} 行数据")
+            return df
+
+        except Exception as e:
+            error(f"执行SQL查询失败: {sql}")
+            error(f"错误信息: {e}")
+            error(traceback.format_exc())
+            raise e
+
    # Backward-compatible aliases for the pre-refactor method names, so
    # existing callers of the old ConnectMysql API keep working unchanged.
    get_conn = get_connection
    execute = execute_query
    execute_df_save = save_dataframe
    read_sql_to_df = read_sql_to_dataframe

+ 6 - 6
utils/db/ConnectMysql_tidb_fix.py

@@ -8,7 +8,7 @@ from pymysql.cursors import DictCursor
 from sqlalchemy import create_engine
 
 from utils.conf.read_conf import yaml_conf
-from utils.log.trans_log import trans_print
+from utils.log.trans_log import error, debug
 
 
 class ConnectMysql:
@@ -30,13 +30,13 @@ class ConnectMysql:
             with conn.cursor(cursor=DictCursor) as cursor:
                 try:
                     cursor.execute(sql, params)
-                    trans_print("开始执行SQL:", cursor._executed)
+                    debug("开始执行SQL:", cursor._executed)
                     conn.commit()
                     result = cursor.fetchall()
                     return result
                 except Exception as e:
-                    trans_print(f"执行sql:{sql},报错:{e}")
-                    trans_print(traceback.format_exc())
+                    error(f"执行sql:{sql},报错:{e}")
+                    error(traceback.format_exc())
                     conn.rollback()
                     raise e
 
@@ -66,10 +66,10 @@ class ConnectMysql:
                     df.to_sql(table_name, engine, if_exists='append', index=False, chunksize=chunksize)
                 except Exception as e:
                     retry_count += 1
-                    trans_print(f" 第 {retry_count} 次重试, 错误: {str(e)}")
+                    error(f" 第 {retry_count} 次重试, 错误: {str(e)}")
                     time.sleep(5 * retry_count)  # 指数退避
                     if retry_count == max_retries:
-                        trans_print(f"处理失败: {str(e)}")
+                        error(f"处理失败: {str(e)}")
                         raise
         except Exception as e:
             engine.dispose()

+ 2 - 3
utils/df_utils/util.py

@@ -6,7 +6,7 @@ import datetime
 import pandas as pd
 
 
-def get_time_space(df, time_str):
+def estimate_time_interval(df, time_str):
     """
     :return: 查询时间间隔
     """
@@ -15,7 +15,6 @@ def get_time_space(df, time_str):
     df1['chazhi'] = df1[time_str].shift(-1) - df1[time_str]
     result = df1.sample(int(df1.shape[0] / 100))['chazhi'].value_counts().idxmax().seconds
     del df1
-    print(datetime.datetime.now() - begin)
     return result
 
 
@@ -46,7 +45,7 @@ def calculate_time_difference(now: datetime.datetime, date: datetime.datetime):
 if __name__ == '__main__':
     df = pd.read_csv(r"D:\data\清理数据\密马风电场\test_11_test\minute\WOG00469.csv")
     df['time_stamp'] = pd.to_datetime(df['time_stamp'])
-    space = get_time_space(df, 'time_stamp')
+    space = estimate_time_interval(df, 'time_stamp')
     min = df['time_stamp'].min()
     max = df['time_stamp'].max()
     result = get_time_space_count(min, max, space)

+ 139 - 38
utils/file/trans_methods.py

@@ -6,25 +6,35 @@ import datetime
 import os
 import shutil
 import warnings
+from typing import List, Dict, Optional
 
 import chardet
 import pandas as pd
 
-from utils.common import excel_types, zip_types
-from utils.log.trans_log import trans_print
+from conf.constants import FileTypes
+from utils.log.trans_log import error, debug
 
 warnings.filterwarnings("ignore")
 
 
 # 获取文件编码
-def detect_file_encoding(filename):
+def detect_file_encoding(filename: str) -> str:
+    """
+    检测文件编码
+    
+    Args:
+        filename: 文件路径
+    
+    Returns:
+        检测到的编码
+    """
     # 读取文件的前1000个字节(足够用于大多数编码检测)
     with open(filename, 'rb') as f:
         rawdata = f.read(1000)
     result = chardet.detect(rawdata)
     encoding = result['encoding']
 
-    trans_print("文件类型:", filename, encoding)
+    debug("文件类型:", filename, encoding)
 
     if encoding is None:
         encoding = 'gb18030'
@@ -35,19 +45,52 @@ def detect_file_encoding(filename):
     return 'gb18030'
 
 
-def del_blank(df=pd.DataFrame(), cols=list()):
def del_blank(df: Optional[pd.DataFrame] = None, cols: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Strip leading/trailing whitespace from the given object-dtype columns, in place.

    Args:
        df: DataFrame to clean; an empty DataFrame is used when omitted.
        cols: column names to process; names missing from ``df`` and
            non-object (non-string) columns are silently skipped.

    Returns:
        The same DataFrame, with the listed string columns stripped.
    """
    # A fresh frame per call fixes the shared-mutable-default pitfall of the
    # previous `df=pd.DataFrame()` signature (one instance reused across calls).
    if df is None:
        df = pd.DataFrame()
    if cols is None:
        cols = []
    for col in cols:
        if col in df.columns and df[col].dtype == object:
            df[col] = df[col].str.strip()
    return df
 
 
 # 切割数组到多个数组
-def split_array(array, num):
def split_array(array: List, num: int) -> List[List]:
    """
    Split a list into consecutive chunks of length ``num``.

    Args:
        array: source list.
        num: chunk length; the final chunk may be shorter.

    Returns:
        List of chunks, preserving element order.
    """
    chunks = []
    for start in range(0, len(array), num):
        chunks.append(array[start:start + num])
    return chunks
 
 
-def find_read_header(file_path, trans_cols, resolve_col_prefix=None):
+def find_read_header(file_path: str, trans_cols: List[str], resolve_col_prefix: Optional[str] = None) -> Optional[int]:
+    """
+    查找文件的表头行
+    
+    Args:
+        file_path: 文件路径
+        trans_cols: 要匹配的列名列表
+        resolve_col_prefix: 列名前缀解析表达式
+    
+    Returns:
+        表头行索引
+    """
     df = read_file_to_df(file_path, nrows=20)
     df.reset_index(inplace=True)
     count = 0
@@ -59,7 +102,7 @@ def find_read_header(file_path, trans_cols, resolve_col_prefix=None):
 
     for col in trans_cols:
         if col in df_cols:
-            count = count + 1
+            count += 1
             if count >= 2:
                 header = 0
                 break
@@ -73,7 +116,7 @@ def find_read_header(file_path, trans_cols, resolve_col_prefix=None):
             values = row.values
         for col in trans_cols:
             if col in values:
-                count = count + 1
+                count += 1
                 if count > 2:
                     header = index + 1
                     return header
@@ -82,30 +125,44 @@ def find_read_header(file_path, trans_cols, resolve_col_prefix=None):
 
 
 # 读取数据到df
-def read_file_to_df(file_path, read_cols=list(), trans_cols=None, nrows=None, not_find_header='raise',
-                    resolve_col_prefix=None):
+def read_file_to_df(file_path: str, read_cols: Optional[List[str]] = None, trans_cols: Optional[List[str]] = None,
+                    nrows: Optional[int] = None, not_find_header: str = 'raise',
+                    resolve_col_prefix: Optional[str] = None) -> pd.DataFrame:
+    """
+    读取文件到数据帧
+    
+    Args:
+        file_path: 文件路径
+        read_cols: 要读取的列列表
+        trans_cols: 要匹配的列名列表
+        nrows: 读取的行数
+        not_find_header: 未找到表头时的处理方式
+        resolve_col_prefix: 列名前缀解析表达式
+    
+    Returns:
+        读取的数据帧
+    """
     begin = datetime.datetime.now()
-    trans_print('开始读取文件', file_path)
+    debug('开始读取文件', file_path)
     header = 0
-    find_cols = list()
     if trans_cols:
         header = find_read_header(file_path, trans_cols, resolve_col_prefix)
-        trans_print(os.path.basename(file_path), "读取第", header, "行")
+        debug(os.path.basename(file_path), "读取第", header, "行")
         if header is None:
             if not_find_header == 'raise':
                 message = '未匹配到开始行,请检查并重新指定'
-                trans_print(message)
+                debug(message)
                 raise Exception(message)
             elif not_find_header == 'ignore':
                 pass
 
-    # read_cols.extend(find_cols)
     df = pd.DataFrame()
     if header is not None:
         try:
-            if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
+            file_path_lower = str(file_path).lower()
+            if file_path_lower.endswith("csv") or file_path_lower.endswith("gz"):
                 encoding = detect_file_encoding(file_path)
-                end_with_gz = str(file_path).lower().endswith("gz")
+                end_with_gz = file_path_lower.endswith("gz")
                 if read_cols:
                     if end_with_gz:
                         df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip',
@@ -115,7 +172,6 @@ def read_file_to_df(file_path, read_cols=list(), trans_cols=None, nrows=None, no
                         df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header,
                                          on_bad_lines='warn', nrows=nrows)
                 else:
-
                     if end_with_gz:
                         df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header, nrows=nrows)
                     else:
@@ -135,16 +191,25 @@ def read_file_to_df(file_path, read_cols=list(), trans_cols=None, nrows=None, no
                     now_df['sheet_name'] = sheet_name
                     df = pd.concat([df, now_df])
                 xls.close()
-            trans_print('文件读取成功:', file_path, '数据数量:', df.shape, '耗时:', datetime.datetime.now() - begin)
+            debug('文件读取成功:', file_path, '数据数量:', df.shape, '耗时:', datetime.datetime.now() - begin)
         except Exception as e:
-            trans_print('读取文件出错', file_path, str(e))
+            error('读取文件出错', file_path, str(e))
             message = '文件:' + os.path.basename(file_path) + ',' + str(e)
             raise ValueError(message)
 
     return df
 
 
-def __build_directory_dict(directory_dict, path, filter_types=None):
+def __build_directory_dict(directory_dict: Dict[str, List[str]], path: str,
+                           filter_types: Optional[List[str]] = None) -> None:
+    """
+    构建目录文件字典
+    
+    Args:
+        directory_dict: 目录文件字典
+        path: 目录路径
+        filter_types: 文件类型过滤器
+    """
     # 遍历目录下的所有项
     for item in os.listdir(path):
         item_path = os.path.join(path, item)
@@ -156,18 +221,31 @@ def __build_directory_dict(directory_dict, path, filter_types=None):
 
             if filter_types is None or len(filter_types) == 0:
                 directory_dict[path].append(item_path)
-            elif str(item_path).split(".")[-1] in filter_types:
-                if str(item_path).count("~$") == 0:
+            else:
+                # 获取文件扩展名
+                ext = os.path.splitext(item_path)[1].lstrip('.').lower()
+                if ext in filter_types and "~$" not in item_path:
                     directory_dict[path].append(item_path)
 
 
 # 读取路径下所有的excel文件
-def read_excel_files(read_path, filter_types=None):
+def read_excel_files(read_path: str, filter_types: Optional[List[str]] = None) -> List[str]:
+    """
+    读取路径下所有的Excel文件
+    
+    Args:
+        read_path: 读取路径
+        filter_types: 文件类型过滤器
+    
+    Returns:
+        文件路径列表
+    """
     if not os.path.exists(read_path):
         return []
 
     if filter_types is None:
-        filter_types = ['xls', 'xlsx', 'csv', 'gz']
+        # filter_types = ['xls', 'xlsx', 'csv', 'gz']
+        filter_types = FileTypes.EXCEL_TYPES
     if os.path.isfile(read_path):
         return [read_path]
 
@@ -178,10 +256,20 @@ def read_excel_files(read_path, filter_types=None):
 
 
 # 读取路径下所有的文件
-def read_files(read_path, filter_types=None):
+def read_files(read_path: str, filter_types: Optional[List[str]] = None) -> List[str]:
+    """
+    读取路径下所有的文件
+    
+    Args:
+        read_path: 读取路径
+        filter_types: 文件类型过滤器
+    
+    Returns:
+        文件路径列表
+    """
     if filter_types is None:
-        filter_types = [i for i in excel_types]
-        filter_types.extend(zip_types)
+        filter_types = list(FileTypes.EXCEL_TYPES)
+        filter_types.extend(FileTypes.ZIP_TYPES)
     if os.path.isfile(read_path):
         return [read_path]
     directory_dict = {}
@@ -190,10 +278,15 @@ def read_files(read_path, filter_types=None):
     return [path1 for paths in directory_dict.values() for path1 in paths if path1]
 
 
-def copy_to_new(from_path, to_path):
-    is_file = False
-    if to_path.count('.') > 0:
-        is_file = True
+def copy_to_new(from_path: str, to_path: str) -> None:
+    """
+    复制文件到新路径
+    
+    Args:
+        from_path: 源文件路径
+        to_path: 目标文件路径
+    """
+    is_file = '.' in to_path
 
     create_file_path(to_path, is_file_path=is_file)
 
@@ -201,11 +294,13 @@ def copy_to_new(from_path, to_path):
 
 
 # 创建路径
-def create_file_path(read_path, is_file_path=False):
+def create_file_path(read_path: str, is_file_path: bool = False) -> None:
     """
     创建路径
-    :param read_path:创建文件夹的路径
-    :param is_file_path: 传入的path是否包含具体的文件名
+    
+    Args:
+        read_path: 创建文件夹的路径
+        is_file_path: 传入的path是否包含具体的文件名
     """
     if is_file_path:
         read_path = os.path.dirname(read_path)
@@ -214,9 +309,15 @@ def create_file_path(read_path, is_file_path=False):
         os.makedirs(read_path, exist_ok=True)
 
 
-def valid_eval(eval_str):
+def valid_eval(eval_str: str) -> bool:
     """
     验证 eval 是否包含非法的参数
+    
+    Args:
+        eval_str: 要验证的表达式
+    
+    Returns:
+        是否合法
     """
     safe_param = ["column", "wind_name", "df", "error_time", "str", "int"]
     eval_str_names = [node.id for node in ast.walk(ast.parse(eval_str)) if isinstance(node, ast.Name)]

+ 0 - 202
utils/file/trans_methods.py_1

@@ -1,202 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Time    : 2024/5/16
-# @Author  : 魏志亮
-import datetime
-from os import *
-import shutil
-import warnings
-
-import chardet
-import pandas as pd
-
-from utils.log.trans_log import trans_print
-
-warnings.filterwarnings("ignore")
-
-
-# 获取文件编码
-def detect_file_encoding(filename):
-    # 读取文件的前1000个字节(足够用于大多数编码检测)
-    with open(filename, 'rb') as f:
-        rawdata = f.read(1000)
-    result = chardet.detect(rawdata)
-    encoding = result['encoding']
-
-    trans_print("文件类型:", filename, encoding)
-
-    if encoding is None:
-        encoding = 'gb18030'
-
-    if encoding.lower() in ['utf-8', 'ascii', 'utf8']:
-        return 'utf-8'
-
-    return 'gb18030'
-
-
-def del_blank(df=pd.DataFrame(), cols=list()):
-    for col in cols:
-        if df[col].dtype == object:
-            df[col] = df[col].str.strip()
-    return df
-
-
-# 切割数组到多个数组
-def split_array(array, num):
-    return [array[i:i + num] for i in range(0, len(array), num)]
-
-
-def find_read_header(file_path, trans_cols):
-    df = read_file_to_df(file_path, nrows=20)
-    count = 0
-    header = None
-    for col in trans_cols:
-        if col in df.columns:
-            count = count + 1
-            if count >= 2:
-                header = 0
-                break
-
-    count = 0
-
-    values = list()
-    for index, row in df.iterrows():
-        values = list(row.values)
-        if type(row.name) == tuple:
-            values.extend(list(row.name))
-        for col in trans_cols:
-            if col in values:
-                count = count + 1
-                if count > 2:
-                    header = index + 1
-                    break
-
-    read_cols = []
-    for col in values:
-        if col in trans_cols:
-            read_cols.append(col)
-
-    return header, read_cols
-
-
-# 读取数据到df
-def read_file_to_df(file_path, read_cols=list(), trans_cols=None, nrows=None):
-    begin = datetime.datetime.now()
-    trans_print('开始读取文件', file_path)
-    header = 0
-    find_cols = list()
-    if trans_cols:
-        header, find_cols = find_read_header(file_path, trans_cols)
-        trans_print(path.basename(file_path), "读取第", header, "行")
-        if header is None:
-            message = '未匹配到开始行,请检查并重新指定'
-            trans_print(message)
-            raise Exception(message)
-
-    read_cols.extend(find_cols)
-
-    try:
-        df = pd.DataFrame()
-        if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
-            encoding = detect_file_encoding(file_path)
-            end_with_gz = str(file_path).lower().endswith("gz")
-            if read_cols:
-                if end_with_gz:
-                    df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip', header=header,
-                                     nrows=nrows)
-                else:
-                    df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header,
-                                     on_bad_lines='warn', nrows=nrows)
-            else:
-
-                if end_with_gz:
-                    df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header, nrows=nrows)
-                else:
-                    df = pd.read_csv(file_path, encoding=encoding, header=header, on_bad_lines='warn', nrows=nrows)
-
-        else:
-            xls = pd.ExcelFile(file_path, engine="calamine")
-            # 获取所有的sheet名称
-            sheet_names = xls.sheet_names
-            for sheet_name in sheet_names:
-                if read_cols:
-                    now_df = pd.read_excel(xls, sheet_name=sheet_name, header=header, usecols=read_cols, nrows=nrows)
-                else:
-                    now_df = pd.read_excel(xls, sheet_name=sheet_name, header=header, nrows=nrows)
-
-                now_df['sheet_name'] = sheet_name
-                df = pd.concat([df, now_df])
-            xls.close()
-        trans_print('文件读取成功:', file_path, '数据数量:', df.shape, '耗时:', datetime.datetime.now() - begin)
-    except Exception as e:
-        trans_print('读取文件出错', file_path, str(e))
-        message = '文件:' + path.basename(file_path) + ',' + str(e)
-        raise ValueError(message)
-
-    return df
-
-
-def __build_directory_dict(directory_dict, path, filter_types=None):
-    # 遍历目录下的所有项
-    for item in listdir(path):
-        item_path = path.join(path, item)
-        if path.isdir(item_path):
-            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
-        elif path.isfile(item_path):
-            if path not in directory_dict:
-                directory_dict[path] = []
-
-            if filter_types is None or len(filter_types) == 0:
-                directory_dict[path].append(item_path)
-            elif str(item_path).split(".")[-1] in filter_types:
-                if str(item_path).count("~$") == 0:
-                    directory_dict[path].append(item_path)
-
-
-# 读取路径下所有的excel文件
-def read_excel_files(read_path):
-    if path.isfile(read_path):
-        return [read_path]
-
-    directory_dict = {}
-    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
-
-    return [path for paths in directory_dict.values() for path in paths if path]
-
-
-# 读取路径下所有的文件
-def read_files(read_path):
-    directory_dict = {}
-    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz', 'zip', 'rar'])
-
-    return [path for paths in directory_dict.values() for path in paths if path]
-
-
-def copy_to_new(from_path, to_path):
-    is_file = False
-    if to_path.count('.') > 0:
-        is_file = True
-
-    create_file_path(to_path, is_file_path=is_file)
-
-    shutil.copy(from_path, to_path)
-
-
-# 创建路径
-def create_file_path(path, is_file_path=False):
-    if is_file_path:
-        path = path.dirname(path)
-
-    if not path.exists(path):
-        makedirs(path, exist_ok=True)
-
-
-if __name__ == '__main__':
-    datas = read_excel_files(r"D:\data\清理数据\招远风电场\WOF053600062-WOB000009_ZYFDC000012\minute")
-    for data in datas:
-        print(data)
-
-    print("*" * 20)
-
-    datas = read_excel_files(r"D:\data\清理数据\招远风电场\WOF053600062-WOB000009_ZYFDC000012\minute\WOG00066.csv.gz")
-    for data in datas:
-        print(data)

+ 99 - 25
utils/log/trans_log.py

@@ -7,6 +7,7 @@ import logging
 import sys
 from os import *
 
+from conf.constants import Log
 from utils.conf.read_conf import read_conf, yaml_conf
 
 
@@ -26,34 +27,107 @@ class ContextFilter(logging.Filter):
         return True
 
 
-logger = logging.getLogger("etl_tools")
-logger.setLevel(logging.INFO)
-stout_handle = logging.StreamHandler(sys.stdout)
-stout_handle.setFormatter(
-    logging.Formatter("%(asctime)s-%(trace_id)s: %(message)s"))
-stout_handle.setLevel(logging.INFO)
-stout_handle.addFilter(ContextFilter())
-logger.addHandler(stout_handle)
+# 初始化日志配置
def init_logger():
    """
    Build and return the shared "etl_tools" logger.

    Console handler (INFO+) is always attached. File handlers -- a daily
    ``<date>.log`` (INFO+) and ``<date>.error.log`` (ERROR+) under the
    configured ``log_path_dir`` -- are best-effort: if they cannot be created,
    logging falls back to console only.

    Returns:
        The configured logging.Logger instance.
    """
    logger = logging.getLogger("etl_tools")
    # Logger passes everything through; each handler filters by its own level.
    logger.setLevel(logging.DEBUG)

    # Re-initialisation (e.g. repeated imports in tests) must not stack handlers.
    if logger.handlers:
        logger.handlers.clear()

    formatter = logging.Formatter("%(asctime)s-%(levelname)s-%(trace_id)s: %(message)s")

    stout_handle = logging.StreamHandler(sys.stdout)
    stout_handle.setFormatter(formatter)
    stout_handle.setLevel(logging.INFO)
    stout_handle.addFilter(ContextFilter())
    logger.addHandler(stout_handle)

    try:
        config_path = environ.get('ETL_CONF')
        if config_path:
            config = yaml_conf(config_path)
            log_path_dir = read_conf(config, 'log_path_dir', Log.DEFAULT_LOG_PATH)
        else:
            log_path_dir = Log.DEFAULT_LOG_PATH

        log_path = log_path_dir + sep + Log.LOG_FILE_PREFIX + (environ['env'] if 'env' in environ else 'dev')
        file_path = path.join(log_path)

        if not path.exists(file_path):
            makedirs(file_path, exist_ok=True)

        # Daily log file (INFO and above).
        file_name = file_path + sep + str(datetime.date.today()) + '.log'
        file_handler = logging.FileHandler(file_name, encoding='utf-8')
        file_handler.setFormatter(formatter)
        file_handler.setLevel(logging.INFO)
        file_handler.addFilter(ContextFilter())
        logger.addHandler(file_handler)

        # Separate error log file (ERROR and above).
        error_file_name = file_path + sep + str(datetime.date.today()) + '.error.log'
        error_file_handler = logging.FileHandler(error_file_name, encoding='utf-8')
        error_file_handler.setFormatter(formatter)
        error_file_handler.setLevel(logging.ERROR)
        error_file_handler.addFilter(ContextFilter())
        logger.addHandler(error_file_handler)
    except Exception as e:
        # Best-effort file logging: keep running with console only, but make
        # the failure visible instead of silently swallowing it (the previous
        # `pass` left missing log files unexplained).
        logger.warning("日志文件初始化失败,仅使用控制台日志: %s", e)

    return logger
+
+
# Module-level shared logger, configured once when this module is first imported.
logger = init_logger()
+
+
def trans_print(*args, level: str = 'info'):
    """
    Join the arguments with two spaces and emit them through the module logger.

    Args:
        *args: values to log; each is converted with str().
        level: one of 'debug', 'info', 'warning', 'error'; any other value
            falls back to info.
    """
    message = "  ".join(str(a) for a in args)

    dispatch = {
        'debug': logger.debug,
        'info': logger.info,
        'warning': logger.warning,
        'error': logger.error,
    }
    # Unknown levels degrade to info, matching the original elif chain.
    dispatch.get(level, logger.info)(message)
+
+
def debug(*args):
    """Emit *args* as a DEBUG-level log entry."""
    trans_print(*args, level='debug')


def info(*args):
    """Emit *args* as an INFO-level log entry."""
    trans_print(*args, level='info')


def warning(*args):
    """Emit *args* as a WARNING-level log entry."""
    trans_print(*args, level='warning')


def error(*args):
    """Emit *args* as an ERROR-level log entry."""
    trans_print(*args, level='error')

+ 113 - 26
utils/systeminfo/sysinfo.py

@@ -1,13 +1,21 @@
-from os import *
+import os
+from typing import List
 
 import psutil
 
-from utils.log.trans_log import trans_print
+from conf.constants import ParallelProcessing
+from utils.log.trans_log import info, debug
 
 
-def print_memory_usage(detail=""):
+def print_memory_usage(detail: str = "") -> None:
+    """
+    打印内存使用情况
+    
+    Args:
+        detail: 详细信息
+    """
     # 获取当前进程ID
-    pid = getpid()
+    pid = os.getpid()
     # 获取进程信息
     py = psutil.Process(pid)
     # 获取内存信息
@@ -21,34 +29,85 @@ def print_memory_usage(detail=""):
     memory_usage_rss_mb = memory_usage_rss / (1024 ** 2)
     memory_usage_vms_mb = memory_usage_vms / (1024 ** 2)
 
-    trans_print(f"{detail},Memory usage (RSS): {memory_usage_rss_mb:.2f} MB")
-    trans_print(f"{detail},Memory usage (VMS): {memory_usage_vms_mb:.2f} MB")
+    debug(f"{detail},Memory usage (RSS): {memory_usage_rss_mb:.2f} MB")
+    debug(f"{detail},Memory usage (VMS): {memory_usage_vms_mb:.2f} MB")
 
 
-def get_cpu_count():
+def get_cpu_count() -> int:
+    """
+    获取CPU核心数
+    
+    Returns:
+        CPU核心数
+    """
     return psutil.cpu_count()
 
 
-def get_available_cpu_count_with_percent(percent: float = 1):
+def get_available_cpu_count_with_percent(percent: float = 1) -> int:
+    """
+    根据百分比获取可用CPU数
+    
+    Args:
+        percent: CPU使用百分比
+    
+    Returns:
+        可用CPU数
+    """
     cpu_count = get_cpu_count()
     return int(cpu_count * percent)
 
 
-def get_file_size(file_path):
-    return path.getsize(file_path)
-
-
-def get_dir_size(dir_path):
-    return sum(get_file_size(path.join(dir_path, file)) for file in listdir(dir_path) if
-               path.isfile(path.join(dir_path, file)))
-
-
-def get_available_memory_with_percent(percent: float = 1):
+def get_file_size(file_path: str) -> int:
+    """
+    获取文件大小
+    
+    Args:
+        file_path: 文件路径
+    
+    Returns:
+        文件大小(字节)
+    """
+    return os.path.getsize(file_path)
+
+
+def get_dir_size(dir_path: str) -> int:
+    """
+    获取目录大小
+    
+    Args:
+        dir_path: 目录路径
+    
+    Returns:
+        目录大小(字节)
+    """
+    return sum(get_file_size(os.path.join(dir_path, file)) for file in os.listdir(dir_path) if
+               os.path.isfile(os.path.join(dir_path, file)))
+
+
+def get_available_memory_with_percent(percent: float = 1) -> int:
+    """
+    根据百分比获取可用内存
+    
+    Args:
+        percent: 内存使用百分比
+    
+    Returns:
+        可用内存(字节)
+    """
     memory_info = psutil.virtual_memory()
     return int(memory_info.available * percent)
 
 
-def get_max_file_size(file_paths: list[str]):
+def get_max_file_size(file_paths: List[str]) -> int:
+    """
+    获取文件列表中的最大文件大小
+    
+    Args:
+        file_paths: 文件路径列表
+    
+    Returns:
+        最大文件大小(字节)
+    """
     max_size = 0
     for file_path in file_paths:
         file_size = get_file_size(file_path)
@@ -57,11 +116,25 @@ def get_max_file_size(file_paths: list[str]):
     return max_size
 
 
-def use_files_get_max_cpu_count(file_paths: list[str], memory_percent: float = 1 / 12, cpu_percent: float = 2 / 5):
+def use_files_get_max_cpu_count(file_paths: List[str], memory_percent: float = 1 / 12,
+                                cpu_percent: float = 2 / 5) -> int:
+    """
+    根据文件大小和内存情况计算最大进程数
+    
+    Args:
+        file_paths: 文件路径列表
+        memory_percent: 内存使用百分比
+        cpu_percent: CPU使用百分比
+    
+    Returns:
+        最大进程数
+    """
     max_file_size = get_max_file_size(file_paths)
     free_memory = get_available_memory_with_percent(memory_percent)
     count = int(free_memory / max_file_size)
     max_cpu_count = get_available_cpu_count_with_percent(cpu_percent)
+    # 限制最大进程数
+    max_cpu_count = min(max_cpu_count, ParallelProcessing.MAX_PROCESSES)
     result = count if count <= max_cpu_count else max_cpu_count
     if result == 0:
         result = 1
@@ -69,21 +142,35 @@ def use_files_get_max_cpu_count(file_paths: list[str], memory_percent: float = 1
     if result > len(file_paths):
         result = len(file_paths)
 
-    trans_print("总文件数:", len(file_paths), ",获取最大文件大小:", str(round(max_file_size / 2 ** 20, 2)) + "M",
-                "可用内存:", str(get_available_memory_with_percent(1) / 2 ** 20) + "M",
-                "总CPU数:", get_cpu_count(), "CPU使用比例:", round(cpu_percent, 2), "CPU可用数量:", max_cpu_count,
-                ",最终确定使用进程数:", result)
+    info("总文件数:", len(file_paths), ",获取最大文件大小:", str(round(max_file_size / 2 ** 20, 2)) + "M",
+         "可用内存:", str(get_available_memory_with_percent(1) / 2 ** 20) + "M",
+         "总CPU数:", get_cpu_count(), "CPU使用比例:", round(cpu_percent, 2), "CPU可用数量:", max_cpu_count,
+         ",最终确定使用进程数:", result)
     return result
 
 
-def max_file_size_get_max_cpu_count(max_file_size, memory_percent: float = 1 / 6, cpu_percent: float = 2 / 5):
+def max_file_size_get_max_cpu_count(max_file_size: int, memory_percent: float = 1 / 6,
+                                    cpu_percent: float = 2 / 5) -> int:
+    """
+    根据最大文件大小和内存情况计算最大进程数
+    
+    Args:
+        max_file_size: 最大文件大小
+        memory_percent: 内存使用百分比
+        cpu_percent: CPU使用百分比
+    
+    Returns:
+        最大进程数
+    """
     free_memory = get_available_memory_with_percent(memory_percent)
     count = int(free_memory / max_file_size)
     max_cpu_count = get_available_cpu_count_with_percent(cpu_percent)
+    # 限制最大进程数
+    max_cpu_count = min(max_cpu_count, ParallelProcessing.MAX_PROCESSES)
     result = count if count <= max_cpu_count else max_cpu_count
     if result == 0:
         result = 1
-    trans_print(",获取最大文件大小:", str(round(max_file_size / 2 ** 20, 2)) + "M",
+    info(",获取最大文件大小:", str(round(max_file_size / 2 ** 20, 2)) + "M",
                 "可用内存:", str(get_available_memory_with_percent(1) / 2 ** 20) + "M",
                 "总CPU数:", get_cpu_count(), "CPU使用比例:", round(cpu_percent, 2), "CPU可用数量:", max_cpu_count,
                 ",最终确定使用进程数:", result)

+ 0 - 0
utils/tmp_util/__init__.py


+ 0 - 37
utils/tmp_util/合并文件.py

@@ -1,37 +0,0 @@
-import multiprocessing
-
-read_dir = r'/data/download/collection_data/1进行中/张崾先风电场-陕西-华电/收资数据/整改复核数据/2025年06月19日16时17分41秒'
-
-import os
-import pandas as pd
-
-# 获取文件夹下所有文件的路径
-file_paths = [os.path.join(read_dir, file) for file in os.listdir(read_dir) if
-              os.path.isfile(os.path.join(read_dir, file))]
-
-
-def read_and_save(wind_no, files, save_dir):
-    # 读取文件
-    df = pd.concat([pd.read_csv(file) for file in files])
-
-    # 保存文件
-    df.to_csv(os.path.join(save_dir, f'{wind_no}.csv'), index=False, encoding='utf-8')
-
-
-if __name__ == '__main__':
-
-    wind_dicts = dict()
-
-    save_dir = r'/data/download/collection_data/1进行中/张崾先风电场-陕西-华电/收资数据/整改复核数据/合并202506191654'
-
-    os.makedirs(save_dir, exist_ok=True)
-
-    for file in os.listdir(read_dir):
-        wind_no = file.split('(')[0]
-        if wind_no not in wind_dicts:
-            wind_dicts[wind_no] = [os.path.join(read_dir, file)]
-        else:
-            wind_dicts[wind_no].append(os.path.join(read_dir, file))
-
-    with multiprocessing.Pool(20) as pool:
-        pool.starmap(read_and_save, [(key, files, save_dir) for key, files in wind_dicts.items()])

+ 0 - 100
utils/tmp_util/整理INSERT到批量INSERT.py

@@ -1,100 +0,0 @@
-# coding=utf-8
-
-
-import re
-from collections import defaultdict
-
-import pymysql
-
-
-def read_sql_inserts(file_path):
-    """生成器函数,逐行读取INSERT语句"""
-    with open(file_path, 'r', encoding='utf-8') as f:
-        for line in f:
-            line = line.strip()
-            if line.startswith('INSERT INTO'):
-                yield line
-
-
-def process_large_sql_file(input_file, batch_size=10000):
-    table_data = defaultdict(lambda: {
-        'columns': None,
-        'value_rows': []
-    })
-
-    insert_pattern = re.compile(
-        r'INSERT\s+INTO\s+`?([a-zA-Z_][a-zA-Z0-9_]*)`?\s*\((.*?)\)\s*VALUES\s*\((.*?)\);',
-        re.IGNORECASE
-    )
-
-    # 使用生成器处理
-    for insert_stmt in read_sql_inserts(input_file):
-        match = insert_pattern.match(insert_stmt)
-        if match:
-            table_name = match.group(1)
-            columns = match.group(2)
-            values = match.group(3)
-
-            if table_data[table_name]['columns'] is None:
-                table_data[table_name]['columns'] = columns
-
-            table_data[table_name]['value_rows'].append(values)
-
-    # 生成批量INSERT语句
-    batch_inserts = {}
-    for table_name, data in table_data.items():
-        columns = data['columns']
-        value_rows = data['value_rows']
-
-        for i in range(0, len(value_rows), batch_size):
-            batch_values = value_rows[i:i + batch_size]
-            batch_insert = f"INSERT INTO `{table_name}` ({columns}) VALUES\n"
-            batch_insert += ",\n".join([f"({values})" for values in batch_values])
-            batch_insert += ";"
-
-            if table_name not in batch_inserts:
-                batch_inserts[table_name] = []
-            batch_inserts[table_name].append(batch_insert)
-
-    return batch_inserts
-
-
-def execute_batch_inserts(db_config, batch_inserts):
-    """直接执行批量INSERT到数据库"""
-    connection = pymysql.connect(**db_config)
-    try:
-        with connection.cursor() as cursor:
-            for table_name, inserts in batch_inserts.items():
-                for index, insert_sql in enumerate(inserts):
-                    cursor.execute(insert_sql)
-                    print(f"表 {table_name},共 {len(inserts)} 个, 第 {index + 1} 个批量INSERT语句执行成功")
-        connection.commit()
-    finally:
-        connection.close()
-
-
-# 数据库配置
-db_config = {
-    'host': '192.168.50.235',
-    'user': 'root',
-    'password': 'admin123456',
-    'db': 'wtlivedb_1',
-    'charset': 'utf8mb4'
-}
-
-"""
-移除INSERT 语句 其他的就是建表语句了
-cat file |grep -v 'INSERT ' > create_talbe.sql
-下面是 INSERT 转化为  BATCH INSERT 的脚本
-"""
-
-if __name__ == "__main__":
-    input_file = "wtlivedb.sql"
-
-    # 使用
-    batch_inserts = process_large_sql_file("input.sql")
-    execute_batch_inserts(db_config, batch_inserts)
-
-    # 打印统计信息
-    for table_name, inserts in batch_inserts.items():
-        print(f"表 '{table_name}': {len(inserts)} 个批量INSERT语句")

+ 0 - 87
utils/tmp_util/神木_完整度_10分.py

@@ -1,87 +0,0 @@
-# coding=utf-8
-
-import datetime
-import multiprocessing
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath(__file__).split("utils")[0])
-
-import pandas as pd
-
-from utils.file.trans_methods import read_file_to_df, read_excel_files
-
-
-def get_time_space_count(start_time: datetime.datetime, end_time: datetime.datetime, time_space=1):
-    """
-    获取俩个时间之间的个数
-    :return: 查询时间间隔
-    """
-    delta = end_time - start_time
-    total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
-
-    return abs(int(total_seconds / time_space)) + 1
-
-
-def save_percent(value, save_decimal=7):
-    return round(value, save_decimal) * 100
-
-
-def read_and_select(file_path, read_cols_bak):
-    try:
-        read_cols = read_cols_bak[0:len(read_cols_bak)]
-        result_df = pd.DataFrame()
-        df = read_file_to_df(file_path, read_cols=read_cols)
-        wind_name = df['名称'].values[0]
-        df['时间'] = pd.to_datetime(df['时间'])
-        count = get_time_space_count(df['时间'].min(), df['时间'].max(), 600)
-        repeat_time_count = df.shape[0] - len(df['时间'].unique())
-        print(wind_name, count, repeat_time_count)
-        result_df['风机号'] = [wind_name]
-        result_df['重复率'] = [save_percent(repeat_time_count / count)]
-        result_df['重复次数'] = [repeat_time_count]
-        result_df['总记录数'] = [count]
-
-        read_cols.remove('名称')
-        for read_col in read_cols:
-
-            if read_col != '时间':
-                df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
-            else:
-                df[read_col] = pd.to_datetime(df[read_col], errors='coerce')
-
-        group_df = df.groupby(by=['名称']).count()
-        group_df.reset_index(inplace=True)
-        count_df = pd.DataFrame(group_df)
-        total_count = count_df[read_cols].values[0].sum()
-        print(wind_name, total_count, count * len(read_cols))
-        result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / (count * len(read_cols)))]
-        result_df['缺失数值'] = [
-            '-'.join([f'{col_name}_{str(count - i)}' for col_name, i in zip(read_cols, count_df[read_cols].values[0])])]
-        del group_df
-
-        error_fengsu_count = df.query("(风速 < 0) | (风速 > 80)").shape[0]
-        error_yougong_gonglv = df.query("(发电机有功功率 < -200) | (发电机有功功率 > 2500)").shape[0]
-
-        result_df['平均异常率'] = [save_percent((error_fengsu_count + error_yougong_gonglv) / (2 * count))]
-    except Exception as e:
-        print(file_path)
-        raise e
-
-    return result_df
-
-
-if __name__ == '__main__':
-    read_cols_str = '名称,时间,发电机有功功率,发电机转速,发电机驱动端轴承温度,发电机非驱动端轴承温度,发电机定子U相线圈温度,发电机定子V相线圈温度,发电机定子W相线圈温度,实际扭矩,设定扭矩,仪表盘风速,舱内温度,控制柜内温度,舱外温度,风向,风速,机舱风向夹角,1#桨叶片角度,1#桨设定角度,2#桨叶片角度,2#桨设定角度,3#桨叶片角度,3#桨设定角度,1#桨电机温度,2#桨电机温度,3#桨电机温度,轮毂内温度,齿轮箱油泵吸油口油压,齿轮箱分配器位置油压,偏航液压刹车系统蓄能罐压力,主轴转速,齿轮箱油路入口温度,齿轮箱中间轴驱动端轴承温度,齿轮箱中间轴非驱动端轴承温度,齿轮箱油池温度,主轴承外圈温度,可利用率,机舱位置,总扭缆角度'
-    read_cols = [i for i in read_cols_str.split(",") if i]
-    read_dir = r'D:\data\tmp_data\10分'
-
-    files = read_excel_files(read_dir)
-
-    with multiprocessing.Pool(4) as pool:
-        dfs = pool.starmap(read_and_select, [(os.path.join(read_dir, i), read_cols) for i in files])
-
-    df = pd.concat(dfs, ignore_index=True)
-    df.sort_values(by=['风机号'], inplace=True)
-
-    df.to_csv("神木风电场-10分钟.csv", encoding='utf8', index=False)

+ 0 - 90
utils/tmp_util/神木_完整度_1分.py

@@ -1,90 +0,0 @@
-# coding=utf-8
-
-import datetime
-import multiprocessing
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath(__file__).split("utils")[0])
-
-import pandas as pd
-
-from utils.file.trans_methods import read_file_to_df, read_excel_files
-
-
-def get_time_space_count(start_time: datetime.datetime, end_time: datetime.datetime, time_space=1):
-    """
-    获取俩个时间之间的个数
-    :return: 查询时间间隔
-    """
-    delta = end_time - start_time
-    total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
-
-    return abs(int(total_seconds / time_space)) + 1
-
-
-def save_percent(value, save_decimal=7):
-    return round(value, save_decimal) * 100
-
-
-def read_and_select(file_path):
-    try:
-        result_df = pd.DataFrame()
-        df = read_file_to_df(file_path)
-        read_cols_bak = df.columns.tolist()
-
-        wind_name = df['名称'].values[0]
-        df['时间'] = pd.to_datetime(df['时间'])
-        count = get_time_space_count(df['时间'].min(), df['时间'].max(), 60)
-        repeat_time_count = df.shape[0] - len(df['时间'].unique())
-        print(wind_name, count, repeat_time_count)
-        result_df['风机号'] = [wind_name]
-        result_df['重复率'] = [save_percent(repeat_time_count / count)]
-        result_df['重复次数'] = [repeat_time_count]
-        result_df['总记录数'] = [count]
-
-        read_cols_bak.remove('名称')
-        read_cols = list()
-        for read_col in read_cols_bak:
-
-            if read_col == '时间':
-                df[read_col] = pd.to_datetime(df[read_col], errors='coerce')
-                read_cols.append(read_col)
-            else:
-                df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
-                if not df[read_col].isnull().all():
-                    read_cols.append(read_col)
-
-        group_df = df.groupby(by=['名称']).count()
-        group_df.reset_index(inplace=True)
-        count_df = pd.DataFrame(group_df)
-        total_count = count_df[read_cols].values[0].sum()
-        print(wind_name, total_count, count * len(read_cols))
-        result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / (count * len(read_cols)))]
-        result_df['缺失数值'] = [
-            '-'.join([f'{col_name}_{str(count - i)}' for col_name, i in zip(read_cols, count_df[read_cols].values[0])])]
-        del group_df
-
-        error_fengsu_count = df.query("(风速 < 0) | (风速 > 80)").shape[0]
-        error_yougong_gonglv = df.query("(发电机有功功率 < -200) | (发电机有功功率 > 2500)").shape[0]
-
-        result_df['平均异常率'] = [save_percent((error_fengsu_count + error_yougong_gonglv) / (2 * count))]
-    except Exception as e:
-        print(file_path)
-        raise e
-
-    return result_df
-
-
-if __name__ == '__main__':
-    read_dir = r'D:\data\tmp_data\1分\远景1min'
-
-    files = read_excel_files(read_dir)
-
-    with multiprocessing.Pool(4) as pool:
-        dfs = pool.map(read_and_select, files)
-
-    df = pd.concat(dfs, ignore_index=True)
-    df.sort_values(by=['风机号'], inplace=True)
-
-    df.to_csv("神木风电场-1分钟.csv", encoding='utf8', index=False)

+ 0 - 18
utils/tmp_util/获取台账所有wind表信息.py

@@ -1,18 +0,0 @@
-import sys
-from os import path, environ
-
-env = 'dev'
-if len(sys.argv) >= 2:
-    env = sys.argv[1]
-
-conf_path = path.abspath(__file__).split("energy-data-trans")[0] + f"/energy-data-trans/conf/etl_config_{env}.yaml"
-environ['ETL_CONF'] = conf_path
-environ['env'] = env
-
-from service.common_connect import plt
-
-tables = 'wind_company,wind_engine_group,wind_engine_mill,wind_exception_count,wind_field,wind_field_batch,wind_field_contract,wind_field_resource,wind_relation'
-
-for table in tables.split(','):
-    df = plt.read_sql_to_df(f"select * from {table}")
-    df.to_csv(table + '.csv', encoding='utf8', index=False)

+ 0 - 76
utils/tmp_util/表添加列.py

@@ -1,76 +0,0 @@
-import os
-import sys
-
-env = 'prod'
-if len(sys.argv) >= 2:
-    env = sys.argv[1]
-
-conf_path = os.path.abspath(__file__).split("energy-data-trans")[0] + f"/energy-data-trans/conf/etl_config_{env}.yaml"
-os.environ['ETL_CONF'] = conf_path
-os.environ['env'] = env
-
-db_last = ''
-if env != 'dev':
-    db_last = db_last + '_' + env
-
-query_sql = f"""
-SELECT
-	t.TABLE_NAME
-FROM
-	information_schema.`TABLES` t
-WHERE
-	t.TABLE_SCHEMA = 'energy_data{db_last}'
-AND t.TABLE_NAME LIKE 'WOF%%_minute'
-AND t.TABLE_NAME NOT IN (
-	SELECT
-		table_name
-	FROM
-		information_schema.`COLUMNS` a
-	WHERE
-		a.TABLE_SCHEMA = 'energy_data{db_last}'
-	AND a.TABLE_NAME LIKE 'WOF%%_minute'
-	AND a.COLUMN_NAME = 'main_bearing_temperature_2'
-)
-"""
-
-
-def get_table_count(table_name):
-    query_begin = time.time()
-    query_sql = f"""
-    select count(1) as count from {table_name}
-    """
-    print(table_name, '统计条数耗时', time.time() - query_begin, trans.execute(query_sql)[0]['count'])
-
-
-def get_update_sql(table_name):
-    update_sql = f"""
-        ALTER TABLE {table_name}
-        ADD COLUMN `main_bearing_temperature_2` double DEFAULT NULL COMMENT '主轴承轴承温度2', 
-        ADD COLUMN `grid_a_phase_current` double DEFAULT NULL COMMENT '电网A相电流',
-        ADD COLUMN `grid_b_phase_current` double DEFAULT NULL COMMENT '电网B相电流',
-        ADD COLUMN `grid_c_phase_current` double DEFAULT NULL COMMENT '电网C相电流',
-        ADD COLUMN `reactive_power` double DEFAULT NULL COMMENT '无功功率';
-        """
-    return update_sql
-
-
-if __name__ == '__main__':
-    from service.common_connect import trans
-
-    # tables = trans.execute(query_sql)
-    # print(tables)
-
-    tables = list()
-    tables.append({'TABLE_NAME': 'WOF093400005_minute'})
-
-    import time
-
-    begin_time = time.time()
-    for table in tables:
-        table_name = '`' + table['TABLE_NAME'] + '`'
-        get_table_count(table_name)
-        update_time = time.time()
-        trans.execute(get_update_sql(table_name))
-        print(table_name, '更新耗时', time.time() - update_time)
-
-    print(len(tables), '张表,总耗时:', time.time() - begin_time)

+ 0 - 49
utils/tmp_util/表添加注释.py

@@ -1,49 +0,0 @@
-import os
-import sys
-
-env = 'tidbprod'
-if len(sys.argv) >= 2:
-    env = sys.argv[1]
-
-conf_path = os.path.abspath(__file__).split("energy-data-trans")[0] + f"/energy-data-trans/conf/etl_config_{env}.yaml"
-os.environ['ETL_CONF'] = conf_path
-os.environ['env'] = env
-
-from service.common_connect import trans, plt
-
-
-def get_all_tables():
-    query_sql = f"""
-    
-    SELECT 
-        t.TABLE_NAME
-    FROM
-        information_schema.`TABLES` t
-    WHERE
-        t.TABLE_SCHEMA = 'energy_data_prod'
-"""
-
-    return trans.execute(query_sql)
-
-
-def get_all_wind_company():
-    query_sql = "SELECT t.field_code,t.field_name FROM wind_field t where t.del_state = 0"
-    datas = plt.execute(query_sql)
-    result_dict = dict()
-    for data in datas:
-        result_dict[data['field_code']] = data['field_name']
-
-    return result_dict
-
-
-if __name__ == '__main__':
-    code_name_dict = get_all_wind_company()
-    tables = get_all_tables()
-    for table in tables:
-        table_name = table['TABLE_NAME']
-
-        if table_name.startswith('WOF'):
-            field_code = table_name.split('_')[0].split('-')[0]
-            if field_code in code_name_dict.keys():
-                update_sql = f"ALTER TABLE `{table_name}` COMMENT = '{code_name_dict[field_code]}'"
-                trans.execute(update_sql)

+ 0 - 27
utils/tmp_util/颗粒度变大.py

@@ -1,27 +0,0 @@
-import os
-
-import pandas as pd
-
-
-def trans_time_granularity(read_dir: str, save_dir: str, time_str: str, time_granularity: str, group_by: list):
-    for root, dirs, files in os.walk(read_dir):
-        for file in files:
-            file_path = os.path.join(root, file)
-            df = pd.read_csv(file_path)
-            # df = df.drop(index=0)
-            df[time_str] = pd.to_datetime(df[time_str], errors='coerce')
-            df[time_str] = df[time_str].dt.ceil(time_granularity)
-            groupby_df = df.groupby(group_by).mean(numeric_only=True).reset_index()
-
-            save_file = file_path.replace(read_dir, save_dir)
-            if not os.path.exists(os.path.dirname(save_file)):
-                os.makedirs(os.path.dirname(save_file))
-
-            groupby_df.to_csv(save_file, index=False, encoding='utf-8')
-
-
-if __name__ == '__main__':
-    read_dir = r'D:\data\tmp_data\龙源\minute'
-    save_dir = r'D:\data\tmp_data\龙源\minute12'
-
-    trans_time_granularity(read_dir, save_dir, 'time_stamp', '20min', ['time_stamp'])

+ 98 - 53
utils/zip/unzip.py

@@ -1,17 +1,27 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/5/17
 # @Author  : 魏志亮
+import os
 import traceback
 import zipfile
-from os import *
+from typing import Tuple, Optional
 
 import rarfile
 
-from utils.file.trans_methods import detect_file_encoding
-from utils.log.trans_log import trans_print, logger
+from utils.file.trans_methods import detect_file_encoding, create_file_path
+from utils.log.trans_log import debug, error
 
 
-def __support_gbk(zip_file: zipfile.ZipFile):
+def __support_gbk(zip_file: zipfile.ZipFile) -> zipfile.ZipFile:
+    """
+    支持GBK编码的zip文件
+    
+    Args:
+        zip_file: ZipFile对象
+    
+    Returns:
+        处理后的ZipFile对象
+    """
     name_to_info = zip_file.NameToInfo
     # copy map first
     for name, info in name_to_info.copy().items():
@@ -23,18 +33,31 @@ def __support_gbk(zip_file: zipfile.ZipFile):
     return zip_file
 
 
-def unzip(zip_filepath, dest_path):
+def unzip(zip_filepath: str, dest_path: str) -> Tuple[bool, Optional[Exception]]:
+    """
+    解压zip文件
+    
+    Args:
+        zip_filepath: zip文件路径
+        dest_path: 解压目标路径
+    
+    Returns:
+        (是否成功, 失败时的异常对象;成功时为 None)
+    """
     # 解压zip文件
     is_success = True
-    trans_print('开始读取文件:', zip_filepath)
-    trans_print("解压到:", dest_path)
+    debug('开始读取文件:', zip_filepath)
+    debug("解压到:", dest_path)
+
+    # 确保目标路径存在
+    create_file_path(dest_path)
 
     try:
         if detect_file_encoding(zip_filepath).startswith("gb"):
             try:
-                with __support_gbk(zipfile.ZipFile(zip_filepath, 'r'))  as zip_ref:
+                with __support_gbk(zipfile.ZipFile(zip_filepath, 'r')) as zip_ref:
                     zip_ref.extractall(dest_path)
-            except:
+            except Exception:
                 with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
                     zip_ref.extractall(dest_path)
         else:
@@ -42,46 +65,60 @@ def unzip(zip_filepath, dest_path):
                 zip_ref.extractall(dest_path)
 
     except zipfile.BadZipFile as e:
-        trans_print(traceback.format_exc())
+        error(traceback.format_exc())
+        is_success = False
+        error('不是zip文件:', zip_filepath)
+        return is_success, e
+    except Exception as e:
+        error(traceback.format_exc())
         is_success = False
-        trans_print('不是zip文件:', zip_filepath)
+        error('解压文件出错:', zip_filepath, str(e))
         return is_success, e
 
     # 遍历解压后的文件
-    dest_path = dest_path
-    trans_print('解压再次读取', dest_path)
+    debug('解压再次读取', dest_path)
     if is_success:
-        for root, dirs, files in walk(dest_path):
+        for root, dirs, files in os.walk(dest_path):
             for file in files:
-                file_path = path.join(root, file)
+                file_path = os.path.join(root, file)
                 # 检查文件是否是zip文件
                 if file_path.endswith('.zip'):
                     if file_path.endswith('.csv.zip'):
-                        rename(file_path, file_path.replace(".csv.zip", ".csv.gz"))
+                        os.rename(file_path, file_path.replace(".csv.zip", ".csv.gz"))
                     else:
                         # 如果是,递归解压
-                        unzip(file_path, dest_path + sep + get_desc_path(str(file)))
-                        # 删除已解压的zip文件(可选)
-                        remove(file_path)
-                    # 检查文件是否是zip文件
-                if file_path.endswith('.rar'):
+                        unzip(file_path, os.path.join(dest_path, get_desc_path(str(file))))
+                        # 删除已解压的zip文件
+                        os.remove(file_path)
+                # 检查文件是否是rar文件
+                elif file_path.endswith('.rar'):
                     # 如果是,递归解压
-                    unrar(file_path, dest_path + sep + get_desc_path(str(file)))
-                    # 删除已解压的zip文件(可选)
-                    remove(file_path)
-
-    return is_success, ''
-
-
-def unrar(rar_file_path, dest_dir):
-    # 检查目标目录是否存在,如果不存在则创建
-    # 解压zip文件
+                    unrar(file_path, os.path.join(dest_path, get_desc_path(str(file))))
+                    # 删除已解压的rar文件
+                    os.remove(file_path)
+
+    return is_success, None
+
+
+def unrar(rar_file_path: str, dest_dir: str) -> Tuple[bool, Optional[Exception]]:
+    """
+    解压rar文件
+    
+    Args:
+        rar_file_path: rar文件路径
+        dest_dir: 解压目标目录
+    
+    Returns:
+        (是否成功, 失败时的异常对象;成功时为 None)
+    """
+    # 解压rar文件
     is_success = True
-    trans_print('开始读取文件:', rar_file_path)
+    debug('开始读取文件:', rar_file_path)
     dest_path = dest_dir
-    trans_print("解压到:", dest_path)
-    if not path.exists(dest_path):
-        makedirs(dest_path)
+    debug("解压到:", dest_path)
+
+    # 确保目标路径存在
+    create_file_path(dest_path)
 
     try:
         # 打开RAR文件
@@ -91,33 +128,41 @@ def unrar(rar_file_path, dest_dir):
                 # 解压文件到目标目录
                 rf.extract(member, dest_path)
     except Exception as e:
-        trans_print(traceback.format_exc())
-        logger.exception(e)
+        error(traceback.format_exc())
         is_success = False
-        trans_print('不是rar文件:', rar_file_path)
+        error('不是rar文件:', rar_file_path)
         return is_success, e
 
     # 遍历解压后的文件
-    print('解压再次读取', dest_path)
+    debug('解压再次读取', dest_path)
     if is_success:
-        for root, dirs, files in walk(dest_path):
+        for root, dirs, files in os.walk(dest_path):
             for file in files:
-                file_path = path.join(root, file)
-                # 检查文件是否是zip文件
+                file_path = os.path.join(root, file)
+                # 检查文件是否是rar文件
                 if file_path.endswith('.rar'):
                     # 如果是,递归解压
                     unrar(file_path, get_desc_path(file_path))
-                    # 删除已解压的zip文件(可选)
-                    remove(file_path)
+                    # 删除已解压的rar文件
+                    os.remove(file_path)
 
-                if file_path.endswith('.zip'):
+                elif file_path.endswith('.zip'):
                     # 如果是,递归解压
                     unzip(file_path, get_desc_path(file_path))
-                    # 删除已解压的zip文件(可选)
-                    remove(file_path)
-
-    return is_success, ''
-
-
-def get_desc_path(path):
-    return path[0:path.rfind(".")]
+                    # 删除已解压的zip文件
+                    os.remove(file_path)
+
+    return is_success, None
+
+
+def get_desc_path(file_path: str) -> str:
+    """
+    获取去除最后一个扩展名后的文件路径(仅去掉最后一个 ".",例如 a/b.tar.gz -> a/b.tar)
+    
+    Args:
+        file_path: 文件路径
+    
+    Returns:
+        去除扩展名的路径
+    """
+    return file_path[0:file_path.rfind(".")]

+ 0 - 0
wind_farm/CGN/__init__.py


+ 0 - 83
wind_farm/CGN/minute_data.py

@@ -1,83 +0,0 @@
-import datetime
-import logging
-import os
-
-import pandas as pd
-import sys
-from sqlalchemy import create_engine
-
-# 更新为第三方数据源
-engine = create_engine('mysql+pymysql://root:admin123456@192.168.50.235:30306/appoint')
-
-base_dir = r'/data/logs/104'
-save_dir = base_dir + os.sep + 'minute'
-log_dir = base_dir + os.sep + 'logs' + os.sep + 'minute'
-
-wind_farm_code_dict = {
-    '风场编号1': '山西风场',
-    '风场编号2': '桂林风场'
-}
-
-
-def create_dir(save_dir, is_file=False):
-    if is_file:
-        save_dir = os.path.dirname(save_dir)
-    os.makedirs(save_dir, exist_ok=True)
-
-
-def init_log():
-    logger = logging.getLogger("104data")
-    logger.setLevel(logging.INFO)
-    stout_handle = logging.StreamHandler(sys.stdout)
-    stout_handle.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    stout_handle.setLevel(logging.INFO)
-    logger.addHandler(stout_handle)
-    create_dir(log_dir)
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-info.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.INFO)
-    logger.addHandler(file_handler)
-
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-error.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.ERROR)
-    logger.addHandler(file_handler)
-
-    return logger
-
-
-logger = init_log()
-
-
-def info_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.info(message)
-
-
-def error_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.error(message)
-
-
-def get_data_and_save_file(df_sql, save_path):
-    info_print(df_sql)
-    df = pd.read_sql_query(df_sql, engine)
-    info_print(df.shape)
-
-
-if __name__ == '__main__':
-    info_print("开始执行")
-    begin = datetime.datetime.now()
-    yestoday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y%m%d')
-    yestoday_sql = f"select * from information_schema.TABLES where TABLE_NAME = {yestoday}"
-
-    get_data_and_save_file(yestoday_sql,
-                           os.path.join(save_dir, wind_farm_code_dict['风场编号1'], yestoday[0:4], yestoday[0:6],
-                                        f'{yestoday}.csv.gz'))
-
-    info_print("执行结束,总耗时:", datetime.datetime.now() - begin)

+ 0 - 83
wind_farm/CGN/purge_history_data.py

@@ -1,83 +0,0 @@
-import datetime
-import logging
-import os
-import sys
-
-import pandas as pd
-from sqlalchemy import create_engine, text
-
-engine = create_engine('mysql+pymysql://root:admin123456@192.168.50.235:30306/appoint')
-
-base_dir = r'/data/logs/104'
-log_dir = base_dir + os.sep + 'logs' + os.sep + 'delete'
-
-
-def create_dir(save_dir, is_file=False):
-    if is_file:
-        save_dir = os.path.dirname(save_dir)
-    os.makedirs(save_dir, exist_ok=True)
-
-
-def init_log():
-    logger = logging.getLogger("104data")
-    logger.setLevel(logging.INFO)
-    stout_handle = logging.StreamHandler(sys.stdout)
-    stout_handle.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    stout_handle.setLevel(logging.INFO)
-    logger.addHandler(stout_handle)
-    create_dir(log_dir)
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-info.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.INFO)
-    logger.addHandler(file_handler)
-
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-error.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.ERROR)
-    logger.addHandler(file_handler)
-
-    return logger
-
-
-logger = init_log()
-
-
-def info_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.info(message)
-
-
-def error_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.error(message)
-
-
-def drop_table(lastdays):
-    # 构建查询语句
-    query = text(
-        f"SELECT TABLE_NAME FROM information_schema.TABLES WHERE TABLE_SCHEMA='appoint' AND TABLE_NAME like '{lastdays}%'")
-    table_df = pd.read_sql(query, engine)
-
-    info_print('查询到表', table_df['TABLE_NAME'].values)
-    for table_name in table_df['TABLE_NAME'].values:
-        # 构建删除表的SQL语句
-        drop_query = text(f"DROP TABLE {table_name}")
-        # 执行删除操作
-        with engine.connect() as connection:
-            connection.execute(drop_query)
-
-        info_print(f"Table {table_name} deleted")
-
-
-if __name__ == '__main__':
-    info_print("开始执行")
-    begin = datetime.datetime.now()
-    lastdays = (datetime.datetime.now() - datetime.timedelta(days=8)).strftime('%Y%m%d')
-    print(lastdays)
-    drop_table(lastdays)
-    info_print("执行结束,总耗时:", datetime.datetime.now() - begin)

+ 0 - 173
wind_farm/CGN/second_data.py

@@ -1,173 +0,0 @@
-import datetime
-import json
-import logging
-import multiprocessing
-import os
-import traceback
-
-import sys
-
-import numpy as np
-import pandas as pd
-from sqlalchemy import create_engine
-
-engine = create_engine('mysql+pymysql://root:admin123456@192.168.50.235:30306/appoint')
-
-base_dir = r'/data/logs/104'
-save_dir = base_dir + os.sep + 'second'
-log_dir = base_dir + os.sep + 'logs' + os.sep + 'second'
-
-def create_dir(save_dir, is_file=False):
-    if is_file:
-        save_dir = os.path.dirname(save_dir)
-    os.makedirs(save_dir, exist_ok=True)
-
-
-def init_log():
-    logger = logging.getLogger("104data")
-    logger.setLevel(logging.INFO)
-    stout_handle = logging.StreamHandler(sys.stdout)
-    stout_handle.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    stout_handle.setLevel(logging.INFO)
-    logger.addHandler(stout_handle)
-    create_dir(log_dir)
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-info.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.INFO)
-    logger.addHandler(file_handler)
-
-    file_name = log_dir + os.sep + datetime.datetime.now().strftime('%Y%m') + '-error.log'
-    file_handler = logging.FileHandler(file_name, encoding='utf-8')
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s: %(message)s"))
-    file_handler.setLevel(logging.ERROR)
-    logger.addHandler(file_handler)
-
-    return logger
-
-
-logger = init_log()
-
-
-def get_all_mesurement_conf():
-    sql = "select * from measurement_conf "
-    return pd.read_sql(sql, engine)
-
-
-def get_all_mesurepoint_conf():
-    sql = "select * from measurepoint_conf t where t.status = 1"
-    return pd.read_sql(sql, engine)
-
-
-def df_value_to_dict(df, key='col1', value='col2'):
-    """
-    :param df: dataframe
-    :param key: 字典的key,如果重复,则返回
-    :param value: 字典的value
-    :return:
-    """
-    result_dict = dict()
-    for k, v in zip(df[key], df[value]):
-        if k in result_dict.keys():
-            if type(result_dict[k]) == list:
-                result_dict[k].append(v)
-            else:
-                result_dict[k] = [result_dict[k]]
-                result_dict[k].append(v)
-        else:
-            result_dict[k] = v
-
-    return result_dict
-
-
-def info_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.info(message)
-
-
-def error_print(*kwargs):
-    message = " ".join([str(i) for i in kwargs])
-    logger.error(message)
-
-
-def exists_table(table_name):
-    sql = f"SELECT * FROM information_schema.tables WHERE table_schema = 'appoint' AND table_name = '{table_name}'"
-    info_print(sql)
-    table_df = pd.read_sql_query(sql, engine)
-    if table_df.empty:
-        return False
-    return True
-
-
-def get_data_and_save_file(table_name, save_path, measurepoint_use_dict):
-    if not exists_table(table_name):
-        error_print(f"{table_name} 表不存在")
-    else:
-        df_sql = f"SELECT * FROM {table_name}"
-        info_print(df_sql)
-        df = pd.read_sql_query(df_sql, engine)
-        info_print(df.shape)
-
-        data_dict = dict()
-        for receive_time, information_object_data in zip(df['receive_time'],
-                                                         df['information_object_data']):
-
-            json_data = json.loads(information_object_data)
-            for k, v in json_data.items():
-                k = int(k)
-                wind_num = k // 103 + 1
-                mesurepoint_num = k % 103
-
-                if wind_num not in data_dict.keys():
-                    data_dict[wind_num] = dict()
-
-                if receive_time not in data_dict[wind_num].keys():
-                    data_dict[wind_num][receive_time] = dict()
-
-                if mesurepoint_num in measurepoint_use_dict.keys():
-                    data_dict[wind_num][receive_time][mesurepoint_num] = v
-
-        datas = list()
-        for wind_num, data in data_dict.items():
-            for receive_time, mesurepoint_data in data.items():
-                data = [wind_num, receive_time]
-                for point_num in measurepoint_use_dict.keys():
-                    data.append(mesurepoint_data[point_num] if point_num in mesurepoint_data.keys() else np.nan)
-                if len(data) > 2:
-                    datas.append(data)
-
-        cols = ['风机编号', '时间']
-        cols.extend(measurepoint_use_dict.values())
-        result_df = pd.DataFrame(data=datas, columns=cols)
-        result_df.sort_values(by=['风机编号', '时间'])
-        create_dir(save_path, True)
-        result_df.to_csv(save_path, encoding='utf8', index=False, compression='gzip')
-        info_print("文件", save_path, '保存成功')
-
-
-if __name__ == '__main__':
-    info_print("开始执行")
-    begin = datetime.datetime.now()
-    try:
-        measurepoint_conf_df = get_all_mesurepoint_conf()
-        measurepoint_use_dict = df_value_to_dict(measurepoint_conf_df, 'id', 'name')
-
-        yestoday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y%m%d')
-
-        measurement_conf_df = get_all_mesurement_conf()
-        tables = list()
-        for id, measurement_wind_field in zip(measurement_conf_df['id'], measurement_conf_df['measurement_wind_field']):
-            tables.append(
-                (f'{yestoday}_{id}', os.path.join(save_dir, measurement_wind_field, yestoday[0:4], yestoday[0:6],
-                                                  yestoday + '.csv.gz')))
-
-        with multiprocessing.Pool(len(tables)) as pool:
-            pool.starmap(get_data_and_save_file, [(t[0], t[1], measurepoint_use_dict) for t in tables])
-    except Exception as e:
-        error_print(traceback.format_exc())
-        raise e
-
-    info_print("执行结束,总耗时:", datetime.datetime.now() - begin)

+ 0 - 0
wind_farm/__init__.py