# StatisticsAndSaveTmpFormalFile.py
import multiprocessing
import traceback
from os import path

import numpy as np
import pandas as pd

from conf.constants import DataProcessing, ParallelProcessing, Types
from etl.common.PathsAndTable import PathsAndTable
from etl.wind_power.min_sec import TransParam
from etl.wind_power.min_sec.ClassIdentifier import ClassIdentifier
from etl.wind_power.min_sec.FilterValidData import FilterValidData
from service.trans_conf_service import update_trans_transfer_progress
from utils.conf.read_conf import read_conf
from utils.df_utils.util import estimate_time_interval as get_time_space
from utils.file.trans_methods import create_file_path, read_excel_files, read_file_to_df
from utils.log.trans_log import debug, error
from utils.systeminfo.sysinfo import use_files_get_max_cpu_count
  17. exec("import math")
  18. class StatisticsAndSaveTmpFormalFile(object):
  19. def __init__(self, paths_and_table: PathsAndTable, trans_param: TransParam, statistics_map,
  20. rated_power_and_cutout_speed_map):
  21. self.paths_and_table = paths_and_table
  22. self.trans_param = trans_param
  23. self.statistics_map = statistics_map
  24. self.lock = multiprocessing.Manager().Lock()
  25. self.rated_power_and_cutout_speed_map = rated_power_and_cutout_speed_map
  26. def set_statistics_data(self, df):
  27. if not df.empty:
  28. df['time_stamp'] = pd.to_datetime(df['time_stamp'])
  29. min_date = df['time_stamp'].min()
  30. max_date = df['time_stamp'].max()
  31. with self.lock:
  32. if 'min_date' in self.statistics_map.keys():
  33. if self.statistics_map['min_date'] > min_date:
  34. self.statistics_map['min_date'] = min_date
  35. else:
  36. self.statistics_map['min_date'] = min_date
  37. if 'max_date' in self.statistics_map.keys():
  38. if self.statistics_map['max_date'] < max_date:
  39. self.statistics_map['max_date'] = max_date
  40. else:
  41. self.statistics_map['max_date'] = max_date
  42. if 'total_count' in self.statistics_map.keys():
  43. self.statistics_map['total_count'] = self.statistics_map['total_count'] + df.shape[0]
  44. else:
  45. self.statistics_map['total_count'] = df.shape[0]
  46. if 'time_granularity' not in self.statistics_map.keys():
  47. self.statistics_map['time_granularity'] = get_time_space(df, 'time_stamp')
  48. def save_to_csv(self, filename):
  49. df = read_file_to_df(filename)
  50. if self.trans_param.is_vertical_table:
  51. df = df.pivot_table(index=['time_stamp', 'wind_turbine_number'], columns=self.trans_param.vertical_key,
  52. values=self.trans_param.vertical_value,
  53. aggfunc='max')
  54. # 重置索引以得到普通的列
  55. df.reset_index(inplace=True)
  56. # 转化风机名称
  57. origin_wind_name = str(df['wind_turbine_number'].values[0])
  58. df['wind_turbine_number'] = df['wind_turbine_number'].astype('str')
  59. # df['wind_turbine_name'] = df['wind_turbine_number']
  60. df['wind_turbine_number'] = df['wind_turbine_number'].map(
  61. self.trans_param.wind_col_trans).fillna(df['wind_turbine_number'])
  62. wind_col_name = str(df['wind_turbine_number'].values[0])
  63. not_double_cols = DataProcessing.NOT_DOUBLE_COLS
  64. # 删除 有功功率 和 风速均为空的情况
  65. df.dropna(subset=['active_power', 'wind_velocity'], how='any', inplace=True)
  66. debug(origin_wind_name, wind_col_name, "删除有功功率和风速有空的情况后:", df.shape)
  67. df.replace(np.nan, DataProcessing.NAN_REPLACE_VALUE, inplace=True)
  68. number_cols = df.select_dtypes(include=['number']).columns.tolist()
  69. for col in df.columns:
  70. if col not in not_double_cols and col not in number_cols:
  71. if not df[col].isnull().all():
  72. df[col] = pd.to_numeric(df[col], errors='coerce')
  73. # 删除包含NaN的行(即那些列A转换失败的行)
  74. df = df.dropna(subset=[col])
  75. debug(origin_wind_name, wind_col_name, "删除非数值列名:", col)
  76. df.replace(DataProcessing.NAN_REPLACE_VALUE, np.nan, inplace=True)
  77. df.drop_duplicates(['wind_turbine_number', 'time_stamp'], keep='first', inplace=True)
  78. df['time_stamp'] = df['time_stamp'].str.strip()
  79. df['time_stamp'] = pd.to_datetime(df['time_stamp'], errors="coerce")
  80. df.dropna(subset=['time_stamp'], inplace=True)
  81. df.sort_values(by='time_stamp', inplace=True)
  82. df = df[[i for i in self.trans_param.cols_tran.keys() if i in df.columns]]
  83. # 删除每行有空值的行(2025-3-24)
  84. # origin_count = df.shape[0]
  85. # df = df.dropna()
  86. # trans_print(f"原始数据量:{origin_count},去除na后数据量:{df.shape[0]}")
  87. # 如果秒级有可能合并到分钟级
  88. # TODO add 秒转分钟
  89. if self.trans_param.boolean_sec_to_min:
  90. df['time_stamp'] = df['time_stamp'].apply(lambda x: x + pd.Timedelta(minutes=(10 - x.minute % 10) % 10))
  91. df['time_stamp'] = df['time_stamp'].dt.floor(DataProcessing.TIME_INTERVAL)
  92. df = df.groupby(['wind_turbine_number', 'time_stamp']).mean().reset_index()
  93. debug('有功功率前10个', df.head(10)['active_power'].values)
  94. power_df = df[df['active_power'] > 0]
  95. debug(origin_wind_name, wind_col_name, "功率大于0的数量:", power_df.shape)
  96. power = power_df.sample(int(power_df.shape[0] / 100))['active_power'].median()
  97. debug(origin_wind_name, wind_col_name, '有功功率,中位数', power)
  98. if power > DataProcessing.POWER_UNIT_THRESHOLD:
  99. df['active_power'] = df['active_power'] / 1000
  100. # 做数据检测前,羡强行处理有功功率
  101. # df = df[df['active_power'] < 50000]
  102. rated_power_and_cutout_speed_tuple = read_conf(self.rated_power_and_cutout_speed_map, str(wind_col_name))
  103. if rated_power_and_cutout_speed_tuple is None:
  104. # rated_power_and_cutout_speed_tuple = (None, None)
  105. error(origin_wind_name, '未从平台匹配到额定功率')
  106. else:
  107. debug(origin_wind_name, '过滤数据前数据大小', df.shape)
  108. debug(origin_wind_name, '额定功率', rated_power_and_cutout_speed_tuple[0])
  109. # trans_print(origin_wind_name, '\n', df.head(10))
  110. filter_valid_data = FilterValidData(df, rated_power_and_cutout_speed_tuple[0])
  111. try:
  112. df = filter_valid_data.run()
  113. except:
  114. error(origin_wind_name, '过滤数据异常', filename)
  115. raise
  116. debug(origin_wind_name, '过滤数据后数据大小', df.shape)
  117. # 如果有需要处理的,先进行代码处理,在进行打标签
  118. # exec_code = get_trans_exec_code(self.paths_and_table.exec_id, self.paths_and_table.read_type)
  119. # if exec_code:
  120. # if 'import ' in exec_code:
  121. # raise Exception("执行代码不支持导入包")
  122. # exec(exec_code)
  123. if power_df.shape[0] == 0:
  124. df.loc[:, 'lab'] = -1
  125. else:
  126. class_identifier = ClassIdentifier(wind_turbine_number=origin_wind_name, origin_df=df,
  127. rated_power=rated_power_and_cutout_speed_tuple[0],
  128. cut_out_speed=rated_power_and_cutout_speed_tuple[1])
  129. df = class_identifier.run()
  130. del power_df
  131. df['year'] = df['time_stamp'].dt.year
  132. df['month'] = df['time_stamp'].dt.month
  133. df['day'] = df['time_stamp'].dt.day
  134. df['time_stamp'] = df['time_stamp'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
  135. df['wind_turbine_name'] = str(origin_wind_name)
  136. df['year_month'] = df[['year', 'month']].apply(lambda x: str(x['year']) + str(x['month']).zfill(2), axis=1)
  137. cols = df.columns
  138. if self.paths_and_table.read_type == Types.SECOND:
  139. type_col = 'year_month'
  140. else:
  141. type_col = 'year'
  142. date_strs = df[type_col].unique().tolist()
  143. for date_str in date_strs:
  144. save_path = path.join(self.paths_and_table.get_tmp_formal_path(), str(date_str),
  145. str(origin_wind_name) + '.csv')
  146. create_file_path(save_path, is_file_path=True)
  147. now_df = df[df[type_col] == date_str][cols]
  148. if self.paths_and_table.save_zip:
  149. save_path = save_path + '.gz'
  150. now_df.to_csv(save_path, compression='gzip', index=False, encoding='utf-8')
  151. else:
  152. now_df.to_csv(save_path, index=False, encoding='utf-8')
  153. del now_df
  154. self.set_statistics_data(df)
  155. del df
  156. debug("保存" + str(wind_col_name) + "成功")
  157. def multiprocessing_to_save_file(self):
  158. # 开始保存到正式文件
  159. all_tmp_files = read_excel_files(self.paths_and_table.get_read_tmp_path())
  160. if not all_tmp_files:
  161. debug("没有临时文件需要处理")
  162. return
  163. # 计算最佳进程数
  164. max_processes = use_files_get_max_cpu_count(all_tmp_files)
  165. max_processes = min(max_processes, len(all_tmp_files), ParallelProcessing.MAX_PROCESSES) # 限制最大进程数
  166. try:
  167. # 创建一个进程池处理所有文件
  168. with multiprocessing.Pool(max_processes) as pool:
  169. # 分批次处理并更新进度
  170. batch_size = max(1, len(all_tmp_files) // ParallelProcessing.MAX_BATCHES) # 最多10个批次
  171. for i in range(0, len(all_tmp_files), batch_size):
  172. batch_files = all_tmp_files[i:i + batch_size]
  173. pool.starmap(self.save_to_csv, [(file,) for file in batch_files])
  174. # 更新进度
  175. progress = 50 + 15 * (i + len(batch_files)) / len(all_tmp_files)
  176. update_trans_transfer_progress(self.paths_and_table.id,
  177. round(progress, 2),
  178. self.paths_and_table.save_db)
  179. except Exception as e:
  180. error(traceback.format_exc())
  181. message = "保存文件错误,系统返回错误:" + str(e)
  182. raise ValueError(message)
  183. def run(self):
  184. self.multiprocessing_to_save_file()
  185. update_trans_transfer_progress(self.paths_and_table.id, 65,
  186. self.paths_and_table.save_db)