Drop rows that contain any null values (2025-3-24)

魏志亮 4 months ago
parent
commit
332d9c8644
46 changed files with 5552 additions and 0 deletions
  1. +5 -0     etl/wind_power/min_sec/StatisticsAndSaveTmpFormalFile.py
  2. +755 -0   tmp_file/ClassIdentifier_1.py_bak
  3. +196 -0   tmp_file/baiyushan_20240906.py
  4. +48 -0    tmp_file/changing_hebing_guzhang.py
  5. +95 -0    tmp_file/cp_file.py
  6. +94 -0    tmp_file/cp_online_data_to_other.py
  7. +47 -0    tmp_file/curge_read.py
  8. +40 -0    tmp_file/error_ms_data.py
  9. +120 -0   tmp_file/extrace_month_data.py
  10. +57 -0   tmp_file/fengxiang_fengdianchang.py
  11. +48 -0   tmp_file/filter_lose_data.py
  12. +205 -0  tmp_file/gradio_web.py
  13. +28 -0   tmp_file/hebing_matlib_result.py
  14. +77 -0   tmp_file/hebing_muti_batch.py
  15. +173 -0  tmp_file/organize_xinhua_files.py
  16. +205 -0  tmp_file/organize_xinhua_files_data.py
  17. +97 -0   tmp_file/orgranize_hongyang.py
  18. +91 -0   tmp_file/power_derating.py
  19. +90 -0   tmp_file/power_derating_biaozhun.py
  20. +213 -0  tmp_file/power_derating_for_chunlin.py
  21. +262 -0  tmp_file/pv_youxiaoxing.py
  22. +134 -0  tmp_file/qinghai-nuomuhong-guifan.py
  23. +162 -0  tmp_file/qinghai-nuomuhong.py
  24. +208 -0  tmp_file/qitaihe_biaozhunhua.py
  25. +139 -0  tmp_file/qitaihe_biaozhunhua_minute.py
  26. +38 -0   tmp_file/queshi_bili.py
  27. +42 -0   tmp_file/read_and_draw_png.py
  28. +27 -0   tmp_file/select_part_cols.py
  29. +114 -0  tmp_file/taipingli_biaozhunhua.py
  30. +19 -0   tmp_file/test_wave.py
  31. +55 -0   tmp_file/zibo_guzhang_select_time.py
  32. +87 -0   tmp_file/压缩内读取.py
  33. +155 -0  tmp_file/大唐玉湖-箱变.py
  34. +158 -0  tmp_file/大唐玉湖数据整理.py
  35. +209 -0  tmp_file/大唐玉湖数据整理_1.py
  36. +283 -0  tmp_file/大唐玉湖数据整理_2.py
  37. +122 -0  tmp_file/大唐玉湖气象合并.py
  38. +96 -0   tmp_file/年度平均缺失率.py
  39. +106 -0  tmp_file/张崾先振动.py
  40. +32 -0   tmp_file/张崾先故障.py
  41. +67 -0   tmp_file/张崾先统计缺失率-分.py
  42. +92 -0   tmp_file/张崾先统计缺失率.py
  43. +31 -0   tmp_file/故障时间整理.py
  44. +97 -0   tmp_file/新华水电列名对比.py
  45. +35 -0   tmp_file/白玉山限电损失.py
  46. +98 -0   tmp_file/陕西建工陕西智华.py

+ 5 - 0
etl/wind_power/min_sec/StatisticsAndSaveTmpFormalFile.py

@@ -97,6 +97,11 @@ class StatisticsAndSaveTmpFormalFile(object):
         df.sort_values(by='time_stamp', inplace=True)
         df = df[[i for i in self.trans_param.cols_tran.keys() if i in df.columns]]
 
+        # drop every row that contains a null value
+        origin_count = df.shape[0]
+        df = df.dropna()
+        trans_print(f'原始数据量:{origin_count},去除na后数据量:{df.shape[0]}')
+
         # second-level data may later be merged up to minute level
         # TODO: add second-to-minute conversion
         if self.trans_param.boolean_sec_to_min:
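
The functional change in this commit is the three-line dropna step above. A minimal sketch of its effect, with plain print standing in for the project's trans_print logger:

    import pandas as pd

    df = pd.DataFrame({"a": [1, None, 3], "b": [4, 5, None]})
    origin_count = df.shape[0]
    df = df.dropna()  # removes every row that holds at least one NaN
    print(f"rows before: {origin_count}, rows after dropna: {df.shape[0]}")  # 3 -> 1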

+ 755 - 0
tmp_file/ClassIdentifier_1.py_bak

@@ -0,0 +1,755 @@
+import numpy as np
+from pandas import DataFrame
+
+from service.plt_service import get_base_wind_and_power
+from utils.file.trans_methods import read_file_to_df
+
+
+class ClassIdentifier(object):
+
+    def __init__(self, wind_turbine_number, file_path: str = None, origin_df: DataFrame = None, index='time_stamp',
+                 wind_velocity='wind_velocity',
+                 active_power='active_power'):
+        """
+        :param wind_turbine_number: The wind turbine number.
+        :param file_path: The file path of the input data.
+        :param origin_df: The pandas DataFrame containing the input data.
+        :param index: index column name
+        :param wind_velocity: wind speed column name
+        :param active_power: active power column name
+        """
+        self.wind_turbine_number = wind_turbine_number
+        self.index = index
+        self.wind_velocity = wind_velocity
+        self.active_power = active_power
+
+        self.rated_wind_speed = 'rated_wind_speed'
+        self.rated_capacity = 'rated_capacity'
+
+        if file_path is None and origin_df is None:
+            raise ValueError("Either file_path or origin_df should be provided.")
+
+        if file_path:
+            self.df = read_file_to_df(file_path)
+        else:
+            self.df = origin_df
+
+        self.df = self.df.set_index(keys=self.index)
+
+    def identifier(self):
+        # wind speed and active power columns
+        wind_and_power_df = self.df[[self.wind_velocity, self.active_power]]
+        wind_and_power_df.reset_index(inplace=True)
+        wind_and_power_df_count = wind_and_power_df.shape[0]
+        PowerMax = wind_and_power_df[self.active_power].max()
+        PowerRated = np.ceil(PowerMax / 100) * 100
+        PRated = 1500  # rated power 1500 kW; can be changed to 2000 kW
+        VCutOut = 25
+        VCutIn = 3
+        VRated = 10
+        # grid method: number of bins along the power axis and the wind-speed axis
+        # PNum = (PRated+100)/25  # power bin width 25 kW
+        PNum = int(np.ceil(PowerRated / 25))  # power bin width 25 kW
+        VNum = int(np.ceil(VCutOut / 0.25))  # wind-speed bin width 0.25 m/s
+
+        # actual generated energy (10-min samples, hence the /6 to get kWh)
+        EPActualTotal = 0
+        for i in range(wind_and_power_df_count):
+            if wind_and_power_df.loc[i, self.active_power] >= 0:
+                EPActualTotal = EPActualTotal + wind_and_power_df.loc[i, self.active_power] / 6
+
+        print("EPActualTotal", EPActualTotal)
+        # 平均风速
+        WindSpeedAvr = 0
+        WindSum = 0
+        for i in range(wind_and_power_df_count):
+            if wind_and_power_df.loc[i, self.wind_velocity] >= 0:
+                WindSum = WindSum + wind_and_power_df.loc[i, self.wind_velocity]
+        WindSpeedAvr = WindSum / wind_and_power_df_count
+        print("windSpeedAvr", WindSpeedAvr)
+        # benchmark power curve for loss calculation; may be replaced with the turbine design power curve
+        # base_wind_and_power_df = get_base_wind_and_power(self.wind_turbine_number)
+        base_wind_and_power_df = read_file_to_df(r"D:\中能智能\matlib计算相关\好点坏点matlib计算\A型风机设计功率曲线.csv", header=None)
+        base_wind_and_power_df.columns = [self.rated_wind_speed, self.rated_capacity]
+        if base_wind_and_power_df.empty:
+            raise ValueError("风场编号:" + self.wind_turbine_number + "未查询到风速功率信息")
+        base_wind_and_power_count = base_wind_and_power_df.shape[0]
+
+        # turbine availability: samples above cut-in wind speed that failed to produce power count against it
+        TurbineRunRate = 0
+        nShouldGP = 0
+        nRealGP = 0
+        for i in range(wind_and_power_df_count):
+            if wind_and_power_df.loc[i, self.wind_velocity] >= VCutIn:
+                nShouldGP = nShouldGP + 1
+                if wind_and_power_df.loc[i, self.active_power] > 0:
+                    nRealGP = nRealGP + 1
+        if nShouldGP > 0:
+            TurbineRunRate = nRealGP / nShouldGP * 100
+
+        print("disp(TurbineRunRate)", TurbineRunRate)
+        # theoretical (ideal) energy
+        EPIdealTotalAAA = 0  # theoretical energy total
+        nWhichBin = 0
+        IdealPower = 0
+        for i in range(wind_and_power_df_count):
+            # expected (theoretical) generation for this sample
+            nWhichBin = 0
+            for m in range(base_wind_and_power_count - 1):
+                if base_wind_and_power_df.loc[m, self.rated_wind_speed] < wind_and_power_df.loc[
+                    i, self.wind_velocity] <= \
+                        base_wind_and_power_df.loc[m + 1, self.rated_wind_speed]:
+                    nWhichBin = m
+                    break
+
+            # interpolate the design power at this wind speed
+            if nWhichBin > base_wind_and_power_count - 1 or nWhichBin == 0:
+                continue
+
+            IdealPower = (wind_and_power_df.loc[i, self.wind_velocity] - base_wind_and_power_df.loc[nWhichBin,
+                                                                                                    self.rated_wind_speed]) / (
+                                 base_wind_and_power_df.loc[nWhichBin + 1, self.rated_wind_speed] -
+                                 base_wind_and_power_df.loc[nWhichBin, self.rated_wind_speed]) * (
+                                 base_wind_and_power_df.loc[nWhichBin + 1, self.rated_capacity] -
+                                 base_wind_and_power_df.loc[nWhichBin, self.rated_capacity]) \
+                         + base_wind_and_power_df.loc[nWhichBin, self.rated_capacity]
+            EPIdealTotalAAA = EPIdealTotalAAA + IdealPower / 6
+
+        print('EPIdealTotalAAA', EPIdealTotalAAA)
+        # keep the operating samples with positive power
+        DzMarch809 = np.zeros([wind_and_power_df_count, 2], dtype=float)
+        nCounter1 = 0
+        for i in range(wind_and_power_df_count):
+            if wind_and_power_df.loc[i, self.active_power] > 0:
+                DzMarch809[nCounter1, 0] = wind_and_power_df.loc[i, self.wind_velocity]
+                DzMarch809[nCounter1, 1] = wind_and_power_df.loc[i, self.active_power]
+
+                nCounter1 = nCounter1 + 1
+
+        print('nCounter1', nCounter1)
+
+        # count the scatter points falling into each grid cell
+        XBoxNumber = np.ones([PNum, VNum], dtype=int)
+        nWhichP = -1
+        nWhichV = -1
+        for i in range(nCounter1):
+            for m in range(PNum):
+                if m * 25 < DzMarch809[i, 1] <= (m + 1) * 25:
+                    nWhichP = m
+                    break
+            for n in range(VNum):
+                if ((n + 1) * 0.25 - 0.125) < DzMarch809[i, 0] <= ((n + 1) * 0.25 + 0.125):
+                    nWhichV = n
+                    break
+
+            if nWhichP > -1 and nWhichV > -1:
+                XBoxNumber[nWhichP, nWhichV] = XBoxNumber[nWhichP, nWhichV] + 1
+
+        for m in range(PNum):
+            for n in range(VNum):
+                XBoxNumber[m, n] = XBoxNumber[m, n] - 1
+
+        print('XBoxNumber', XBoxNumber)
+        # convert per-cell counts into percentages along the power axis (kept for later use)
+        PBoxPercent = np.zeros([PNum, VNum], dtype=float)
+        PBinSum = np.zeros(PNum, dtype=int)
+
+        for i in range(PNum):
+            for m in range(VNum):
+                PBinSum[i] = PBinSum[i] + XBoxNumber[i, m]
+
+            for m in range(VNum):
+                if PBinSum[i] > 0:
+                    PBoxPercent[i, m] = XBoxNumber[i, m] / PBinSum[i] * 100
+
+        # convert per-cell counts into percentages along the wind-speed axis (kept for later use)
+        VBoxPercent = np.zeros([PNum, VNum], dtype=float)
+        VBinSum = np.zeros(VNum, dtype=int)
+
+        for i in range(VNum):
+            for m in range(PNum):
+                VBinSum[i] = VBinSum[i] + XBoxNumber[m, i]
+
+            for m in range(PNum):
+                if VBinSum[i] > 0:
+                    VBoxPercent[m, i] = XBoxNumber[m, i] / VBinSum[i] * 100
+
+        # along each horizontal power band, locate the main-band center,
+        # i.e. the grid cell holding the largest percentage
+        PBoxMaxIndex = np.zeros(PNum, dtype=int)  # peak-cell index per horizontal power band
+        PBoxMaxP = np.zeros(PNum, dtype=int)  # peak-cell percentage per horizontal power band
+
+        for m in range(PNum):
+            # peak-cell index and percentage for this horizontal power band
+            PBoxMaxP[m], PBoxMaxIndex[m] = PBoxPercent[m, :].max(), PBoxPercent[m, :].argmax()
+
+        # likewise along each vertical wind-speed band, locate the main-band center cell
+        VBoxMaxIndex = np.zeros(VNum, dtype=int)
+        VBoxMaxV = np.zeros(VNum, dtype=int)
+
+        for m in range(VNum):
+            [VBoxMaxV[m], VBoxMaxIndex[m]] = VBoxPercent[:, m].max(), VBoxPercent[:, m].argmax()
+
+        # special-case the cut-in band: if its peak sits too far right, pull it back left
+        if PBoxMaxIndex[0] > 14:
+            PBoxMaxIndex[0] = 9
+
+        # analysis referenced to the horizontal power bands
+        DotDense = np.zeros(PNum, dtype=int)  # number of cells in the main band of each power band
+        DotDenseLeftRight = np.zeros([PNum, 2], dtype=int)  # cells the main band spreads left and right of its peak cell
+        DotValve = 90  # threshold on the accumulated point percentage when expanding symmetrically from the center
+        PDotDenseSum = 0
+
+        iSpreadLeft = 1  # left-expansion cell counter, starts at 1
+        iSpreadRight = 1  # right-expansion cell counter, starts at 1
+        for i in range(PNum - 6):  # scan the power bands bottom-up, through band PNum-6 (the bands below rated power)
+            PDotDenseSum = PBoxMaxP[i]  # expand symmetrically from the peak cell, accumulating cell percentages
+            iSpreadRight = 1
+            iSpreadLeft = 1
+            while PDotDenseSum < DotValve:
+
+                if (PBoxMaxIndex[i] + iSpreadRight) < VNum - 1:
+                    PDotDenseSum = PDotDenseSum + PBoxPercent[i, PBoxMaxIndex[i] + iSpreadRight]  # expand rightwards
+                    iSpreadRight = iSpreadRight + 1
+
+                if (PBoxMaxIndex[i] + iSpreadRight) > VNum - 1:
+                    break
+
+                if (PBoxMaxIndex[i] - iSpreadLeft) > 0:
+                    PDotDenseSum = PDotDenseSum + PBoxPercent[i, PBoxMaxIndex[i] - iSpreadLeft]  # expand leftwards
+                    iSpreadLeft = iSpreadLeft + 1
+
+                if (PBoxMaxIndex[i] - iSpreadLeft) <= 0:
+                    break
+
+            iSpreadRight = iSpreadRight - 1
+
+            iSpreadLeft = iSpreadLeft - 1
+            # symmetric left/right expansion finished
+
+            DotDenseLeftRight[i, 0] = iSpreadLeft
+            DotDenseLeftRight[i, 1] = iSpreadRight
+            DotDense[i] = iSpreadLeft + iSpreadRight + 1
+
+        # the median right-side width of the main band across rows is the most representative
+        DotDenseWidthLeft = np.zeros([PNum - 6, 1], dtype=int)
+        for i in range(PNum - 6):
+            DotDenseWidthLeft[i] = DotDenseLeftRight[i, 1]
+
+        MainBandRight = np.median(DotDenseWidthLeft)
+
+        # power bands whose scatter stretches markedly to the right are curtailment bands
+        PowerLimit = np.zeros([PNum, 1], dtype=int)  # per-band curtailment flag: 1 = curtailed, 0 = not
+        WidthAverage = 0  # mean width of the main power band
+        WidthVar = 0  # width variance of the main power band
+        # PowerLimitValve = 6    # curtailment-band detection threshold
+        PowerLimitValve = np.ceil(MainBandRight) + 3  # curtailment-band detection threshold
+
+        nCounterLimit = 0
+        nCounter = 0
+
+        for i in range(PNum - 6):
+            if DotDenseLeftRight[i, 1] > PowerLimitValve and PBinSum[i] > 20:  # curtailed if the right spread exceeds the threshold and the band holds more than 20 points
+                PowerLimit[i] = 1
+                nCounterLimit = nCounterLimit + 1
+
+            if DotDenseLeftRight[i, 1] <= PowerLimitValve:
+                WidthAverage = WidthAverage + DotDenseLeftRight[i, 1]  # accumulate the right-side width of normal bands
+                nCounter = nCounter + 1
+
+        WidthAverage = WidthAverage / nCounter  # mean width of the main power band
+
+        print("WidthAverage", WidthAverage)
+
+        # width variance across bands: shows whether the main band keeps a consistent width bottom-to-top or is anomalous (e.g. wide below, narrow above)
+        for i in range(PNum - 6):
+            if DotDenseLeftRight[i, 1] <= PowerLimitValve:
+                WidthVar = WidthVar + (DotDenseLeftRight[i, 1] - WidthAverage) * (
+                        DotDenseLeftRight[i, 1] - WidthAverage)
+
+        WidthVar = np.sqrt(WidthVar / nCounter)
+
+        # wind-speed span of the main band per power band: right spread * 2 * 0.25 m/s
+        PowerBandWidth = WidthAverage * 2 * 0.25
+
+        # if a curtailed band's peak cell sits far right of the band below it, pull it back
+        for i in range(1, PNum - 6):
+            if PowerLimit[i] == 1 and abs(PBoxMaxIndex[i] - PBoxMaxIndex[i - 1]) > 5:
+                PBoxMaxIndex[i] = PBoxMaxIndex[i - 1] + 1
+
+        # left/right boundary cell indices of the main band, per layer
+        DotDenseInverse = np.zeros([PNum, 2], dtype=int)
+
+        for i in range(PNum):
+            DotDenseInverse[i, :] = DotDenseLeftRight[PNum - i - 1, :]
+
+        # print('DotDenseInverse', DotDenseInverse)
+
+        # right boundary of the main power band
+        CurveWidthR = int(np.ceil(WidthAverage) + 2)
+
+        # CurveWidthL = 6    # left boundary of the main power band
+        CurveWidthL = CurveWidthR
+
+        BBoxLimit = np.zeros([PNum, VNum], dtype=int)  # curtailment-cell flag: within a curtailed band, cells right of the main band's right edge are curtailment cells
+        for i in range(2, PNum - 6):
+            if PowerLimit[i] == 1:
+                for j in range(PBoxMaxIndex[i] + CurveWidthR, VNum):
+                    BBoxLimit[i, j] = 1
+
+        BBoxRemove = np.zeros([PNum, VNum], dtype=int)  # cells to discard as anomalous: 1 = under-production right of the main band; 2 = over-production left of it
+        for m in range(PNum - 6):
+            for n in range(PBoxMaxIndex[m] + CurveWidthR - 1, VNum):
+                BBoxRemove[m, n] = 1
+
+            for n in range(PBoxMaxIndex[m] - CurveWidthL - 1, 0, -1):
+                BBoxRemove[m, n] = 2
+
+        # locate the upper-left knee of the main band, i.e. the cell index at rated wind speed
+        CurveTop = np.zeros(2, dtype=int)
+        CurveTopValve = 3  # percentage threshold for a cell
+        BTopFind = 0
+        for m in range(PNum - 4 - 1, 0, -1):
+            for n in range(VNum):
+                if VBoxPercent[m, n] > CurveTopValve and XBoxNumber[m, n] >= 10:  # upper-left cell whose percentage and point count clear the thresholds
+                    CurveTop[0] = m
+                    CurveTop[1] = n
+                    BTopFind = 1
+                    break
+
+            if BTopFind == 1:
+                break
+
+        IsolateValve = 3
+        for m in range(PNum - 6):
+            for n in range(PBoxMaxIndex[m] + CurveWidthR - 1, VNum):
+                if PBoxPercent[m, n] < IsolateValve:
+                    BBoxRemove[m, n] = 1
+
+        # width of the main band's top section
+        CurveWidthT = 2
+        for m in range(PNum - CurveWidthT - 1, PNum):
+            for n in range(VNum):
+                BBoxRemove[m, n] = 3  # cells above rated power: over-production points
+
+        # flag the under-production cells left of the main-band knee
+        for m in range(PNum - 5 - 1, PNum):
+            for n in range(CurveTop[1] - 2 - 1):
+                BBoxRemove[m, n] = 2
+
+        # every point inherits the flag of the grid cell it falls in;
+        # Dzwind_and_power_dfSel holds the flag of each positive-power sample
+        Dzwind_and_power_dfSel = np.zeros(nCounter1, dtype=int)  # 1 = under-production; 2 = over-production; 3 = over-production above rated wind speed; 4 = curtailed
+        nWhichP = 0
+        nWhichV = 0
+        nBadA = 0
+
+        for i in range(nCounter1):
+            for m in range(PNum):
+                if DzMarch809[i, 1] > (m - 1) * 25 and DzMarch809[i, 1] <= m * 25:
+                    nWhichP = m
+                    break
+
+            for n in range(VNum):
+                if DzMarch809[i, 0] > (n * 0.25 - 0.125) and DzMarch809[i, 0] <= (n * 0.25 + 0.125):
+                    nWhichV = n
+                    break
+
+            if nWhichP > 0 and nWhichV > 0:
+
+                if BBoxRemove[nWhichP, nWhichV] == 1:
+                    Dzwind_and_power_dfSel[i] = 1
+                    nBadA = nBadA + 1
+
+                if BBoxRemove[nWhichP, nWhichV] == 2:
+                    Dzwind_and_power_dfSel[i] = 2
+
+                if BBoxRemove[nWhichP, nWhichV] == 3:
+                    Dzwind_and_power_dfSel[i] = 0  # 3  # over-production above rated wind speed is treated as normal and no longer flagged
+
+                if BBoxLimit[nWhichP, nWhichV] == 1 and nWhichP>16:
+                    Dzwind_and_power_dfSel[i] = 4
+
+        print("nWhichP", nWhichP)
+        print("nWhichV", nWhichV)
+        print("nBadA", nBadA)
+
+        # curtailment detection method 2: slice the data into windows; in each window, flag runs of
+        # nWindowLength consecutive points whose power stays within a tolerance band of the first
+        # point, i.e. points lying on a markedly horizontal line
+        PVLimit = np.zeros([nCounter1, 2], dtype=int)  # stores the curtailed samples
+        nLimitTotal = 0
+        nWindowLength = 3
+        LimitWindow = np.zeros(nWindowLength, dtype=int)
+        UpLimit = 0  # upper bound
+        LowLimit = 0  # lower bound
+        PowerStd = 15  # allowed power fluctuation
+        bAllInUpLow = 1  # 1 = every sample in the window stays inside the bounds (curtailed); 0 = not satisfied
+        bAllInAreas = 1  # 1 = every sample in the window lies within 200 to PRated-300 kW; 0 = not satisfied
+        nWindowNum = int(np.floor(nCounter1 / nWindowLength))
+        PowerLimitUp = PRated - 300
+        PowerLimitLow = 200
+        for i in range(nWindowNum):
+            for j in range(nWindowLength):
+                LimitWindow[j] = DzMarch809[i * nWindowLength + j, 1]
+
+            bAllInAreas = 1
+            for j in range(nWindowLength):
+                if LimitWindow[j] < PowerLimitLow or LimitWindow[j] > PowerLimitUp:
+                    bAllInAreas = 0
+
+            if bAllInAreas == 0:
+                continue
+
+            UpLimit = LimitWindow[0] + PowerStd
+            LowLimit = LimitWindow[0] - PowerStd
+            bAllInUpLow = 1
+            for j in range(1, nWindowLength):
+                if LimitWindow[j] < LowLimit or LimitWindow[j] > UpLimit:
+                    bAllInUpLow = 0
+
+            if bAllInUpLow == 1:
+                for j in range(nWindowLength):
+                    Dzwind_and_power_dfSel[i * nWindowLength + j] = 4  # flag the window's samples as curtailed
+
+                for j in range(nWindowLength):
+                    PVLimit[nLimitTotal, :] = DzMarch809[i * nWindowLength + j, :]
+                    nLimitTotal = nLimitTotal + 1
+
+        print("nLimitTotal", nLimitTotal)
+
+        # smooth the sawtooth between adjacent horizontal main bands
+        PVLeftDown = np.zeros(2, dtype=int)
+        PVRightUp = np.zeros(2, dtype=int)
+        nSmooth = 0
+        for i in range(PNum - 6 - 1):
+            PVLeftDown = np.zeros(2, dtype=int)
+            PVRightUp = np.zeros(2, dtype=int)
+
+            if (PBoxMaxIndex[i + 1] - PBoxMaxIndex[i]) >= 1:
+                PVLeftDown[0] = (PBoxMaxIndex[i] + CurveWidthR) * 0.25 - 0.125
+                PVLeftDown[1] = (i - 1) * 25
+
+                PVRightUp[0] = (PBoxMaxIndex[i + 1] + CurveWidthR) * 0.25 - 0.125
+                PVRightUp[1] = (i + 1 - 1) * 25
+
+                for m in range(nCounter1):
+                    if DzMarch809[m, 0] > PVLeftDown[0] and DzMarch809[m, 0] < PVRightUp[0] and PVLeftDown[1] < \
+                            DzMarch809[m, 1] < PVRightUp[1]:  # inside this sawtooth cell
+                        if (DzMarch809[m, 1] - PVLeftDown[1]) / (DzMarch809[m, 0] - PVLeftDown[0]) > (
+                                PVRightUp[1] - PVLeftDown[1]) / (
+                                PVRightUp[0] - PVLeftDown[0]):  # slope above the diagonal: in the upper-left triangle, so selected
+                            Dzwind_and_power_dfSel[m] = 0
+                            nSmooth = nSmooth + 1
+
+        print("nSmooth", nSmooth)
+
+        # collect the good points
+        nCounterPV = 0
+        PVDot = np.zeros([nCounter1, 2], dtype=int)
+        for i in range(nCounter1):
+            if Dzwind_and_power_dfSel[i] == 0:
+                PVDot[nCounterPV, :] = DzMarch809[i, :]
+                nCounterPV = nCounterPV + 1
+
+        nCounterVP = nCounterPV
+        print("nCounterVP", nCounterVP)
+
+        # collect the bad points
+        nCounterBad = 0
+        PVBad = np.zeros([nCounter1, 2], dtype=int)
+        for i in range(nCounter1):
+            if Dzwind_and_power_dfSel[i] == 1 or Dzwind_and_power_dfSel[i] == 2 or Dzwind_and_power_dfSel[i] == 3:
+                PVBad[nCounterBad, :] = DzMarch809[i, :]
+                nCounterBad = nCounterBad + 1
+
+        print("nCounterBad", nCounterBad)
+
+        # build the measured power curve from the good points inside the main band
+        XBinNumber = np.ones(50, dtype=int)
+        PCurve = np.zeros([50, 2], dtype=int)
+        PCurve[:, 0] = [i / 2 for i in range(1, 51)]
+        XBinSum = np.zeros([50, 2], dtype=int)
+        nWhichBin = 0
+
+        for i in range(nCounterVP):
+            nWhichBin = 0
+
+            for b in range(50):
+                if PVDot[i, 0] > (b * 0.5 - 0.25) and PVDot[i, 0] <= (b * 0.5 + 0.25):
+                    nWhichBin = b
+                    break
+
+            if nWhichBin > 0:
+                XBinSum[nWhichBin, 0] = XBinSum[nWhichBin, 0] + PVDot[i, 0]  # wind speed
+                XBinSum[nWhichBin, 1] = XBinSum[nWhichBin, 1] + PVDot[i, 1]  # Power
+                XBinNumber[nWhichBin] = XBinNumber[nWhichBin] + 1
+
+        for b in range(50):
+            XBinNumber[b] = XBinNumber[b] - 1
+
+        for b in range(50):
+            if XBinNumber[b] > 0:
+                PCurve[b, 0] = XBinSum[b, 0] / XBinNumber[b]
+                PCurve[b, 1] = XBinSum[b, 1] / XBinNumber[b]
+
+        # above rated wind speed, assign rated power directly
+        VRatedNum = int(VRated / 0.5)
+        for m in range(VRatedNum, 50):
+            if PCurve[m, 1] == 0:
+                PCurve[m, 1] = PRated
+
+        # print("PCurve", PCurve)
+
+        # build the standard normalized power curve on a 0.5 m/s grid:
+        # rated power above 15 m/s, computed values below 15 m/s
+        PCurveNorm = np.zeros([50, 2], dtype=int)
+        for i in range(30, 50):
+            PCurveNorm[i, 0] = i * 0.5
+            PCurveNorm[i, 1] = PRated
+
+        # normalized power curve below 15 m/s
+        CurveData = np.zeros([30, 2], dtype=int)
+        for i in range(30):
+            CurveData[i, :] = PCurve[i, :]
+
+        CurveNorm = np.zeros([30, 2], dtype=int)
+        VSpeed = [i / 2 for i in range(1, 31)]
+
+        WhichBin = 0
+
+        K = 0
+        a = 0
+        for m in range(30):
+            K = 0
+            a = 0
+
+            for n in range(30):
+                if abs(CurveData[n, 0] - VSpeed[m]) < 0.1:
+                    WhichBin = n
+                    break
+
+            if WhichBin > 1:
+                if CurveData[WhichBin, 0] - CurveData[WhichBin - 1, 0] > 0:
+                    K = (CurveData[WhichBin, 1] - CurveData[WhichBin - 1, 1]) / (
+                            CurveData[WhichBin, 0] - CurveData[WhichBin - 1, 0])
+                    a = CurveData[WhichBin, 1] - K * CurveData[WhichBin, 0]
+
+            CurveNorm[m, 0] = VSpeed[m]
+            CurveNorm[m, 1] = a + K * VSpeed[m]
+
+        for i in range(30):
+            PCurveNorm[i, :] = CurveNorm[i, :]
+
+        # sub-module 3: loss calculation and generation-performance evaluation
+        CC = len(PCurve[:, 0])
+        EPIdealTotal = 0
+        # downtime loss
+        EPLostStopTotal = 0
+        EPLost = 0
+
+        nWhichBin = 0
+        IdealPower = 0
+        nStopTotal = 0
+        for i in range(wind_and_power_df_count):
+            if wind_and_power_df.loc[i, self.active_power] <= 0:
+                nWhichBin = 0
+                for m in range(base_wind_and_power_count - 1):
+                    if wind_and_power_df.loc[i, self.wind_velocity] > base_wind_and_power_df.loc[
+                        m, self.rated_wind_speed] and wind_and_power_df.loc[i, self.wind_velocity] <= \
+                            base_wind_and_power_df.loc[
+                                m + 1, self.rated_wind_speed]:
+                        nWhichBin = m
+                        break
+
+                if nWhichBin > base_wind_and_power_count - 1 or nWhichBin == 0:
+                    continue
+
+                IdealPower = (wind_and_power_df.loc[i, self.wind_velocity] - base_wind_and_power_df.loc[
+                    nWhichBin, self.rated_wind_speed]) / (
+                                     base_wind_and_power_df.loc[nWhichBin + 1, self.rated_wind_speed] -
+                                     base_wind_and_power_df.loc[
+                                         nWhichBin, self.rated_wind_speed]) * (
+                                     base_wind_and_power_df.loc[nWhichBin + 1, self.rated_capacity]
+                                     - base_wind_and_power_df.loc[nWhichBin, self.rated_capacity]) \
+                             + base_wind_and_power_df.loc[nWhichBin, self.rated_capacity]
+
+                EPLost = IdealPower / 6
+                EPLostStopTotal = EPLostStopTotal + EPLost
+                nStopTotal = nStopTotal + 1
+
+        print("EPLost", EPLost)
+        print("nStopTotal", nStopTotal)
+        print("EPLostStopTotal", EPLostStopTotal)
+
+        nWhichP = 0
+        nWhichV = 0
+        nWhichBin = 0
+        IdealPower = 0
+
+        # under-production loss; curtailment loss is excluded here, as curtailed points were already removed from the under-production set
+        EPLostBadTotal = 0
+        EPLost = 0
+
+        nBadTotal = 0
+
+        LostBadPercent = 0
+
+        EPOverTotal = 0
+        EPOver = 0
+        nOverTotal = 0
+
+        for i in range(nCounter1):
+            if Dzwind_and_power_dfSel[i] == 1:
+                nWhichBin = 0
+                for m in range(base_wind_and_power_count - 1):
+                    if DzMarch809[i, 0] > base_wind_and_power_df.loc[m, self.rated_wind_speed] \
+                            and DzMarch809[i, 0] <= base_wind_and_power_df.loc[m + 1, self.rated_wind_speed]:
+                        nWhichBin = m
+                        break
+
+                if nWhichBin > base_wind_and_power_count - 1 or nWhichBin == 0:
+                    continue
+
+                IdealPower = (DzMarch809[i, 0] - base_wind_and_power_df.loc[nWhichBin, self.rated_wind_speed]) / (
+                        base_wind_and_power_df.loc[nWhichBin + 1, self.rated_wind_speed] - base_wind_and_power_df.loc[
+                    nWhichBin, self.rated_wind_speed]) * (
+                                     base_wind_and_power_df.loc[nWhichBin + 1, self.rated_capacity] -
+                                     base_wind_and_power_df.loc[nWhichBin, self.rated_capacity]) + \
+                             base_wind_and_power_df.loc[nWhichBin, self.rated_capacity]
+                EPLost = abs(IdealPower - DzMarch809[i, 1]) / 6
+                EPLostBadTotal = EPLostBadTotal + EPLost
+                nBadTotal = nBadTotal + 1
+
+            # over-production above rated wind speed
+            if Dzwind_and_power_dfSel[i] == 3:
+                EPOver = (DzMarch809[i, 1] - PRated) / 6
+                EPOverTotal = EPOverTotal + EPOver
+                nOverTotal = nOverTotal + 1
+
+        print("EPLost", EPLost)
+        print("nBadTotal", nBadTotal)
+        print("EPLostBadTotal", EPLostBadTotal)
+        print("EPOverTotal", EPOverTotal)
+        print("nOverTotal", nOverTotal)
+
+        # loss from the power curve falling short of the benchmark
+        EPLostPerformTotal = 0
+        nWhichBinI = 0
+        IdealPower = 0
+
+        for i in range(nCounterVP):
+
+            for m in range(base_wind_and_power_count - 1):
+                if PVDot[i, 0] > base_wind_and_power_df.loc[m, self.rated_wind_speed] and PVDot[i, 0] <= \
+                        base_wind_and_power_df.loc[m + 1, self.rated_wind_speed]:
+                    nWhichBinI = m
+                    break
+
+            if nWhichBinI > base_wind_and_power_count - 1 or nWhichBinI == 0:
+                continue
+
+            IdealPower = (PVDot[i, 0] - base_wind_and_power_df.loc[nWhichBinI, self.rated_wind_speed]) / (
+                    base_wind_and_power_df.loc[nWhichBinI + 1, self.rated_wind_speed] - base_wind_and_power_df.loc[
+                nWhichBinI, self.rated_wind_speed]) * \
+                         (base_wind_and_power_df.loc[nWhichBinI + 1, self.rated_capacity] -
+                          base_wind_and_power_df.loc[nWhichBinI, self.rated_capacity]) + \
+                         base_wind_and_power_df.loc[nWhichBinI, self.rated_capacity]
+
+            EPLostPerformTotal = EPLostPerformTotal + (IdealPower - PVDot[i, 1]) / 6
+
+        print("EPLostPerformTotal", EPLostPerformTotal)
+
+        # curtailment loss
+        EPLostLimitTotal = 0
+        EPLost = 0
+        nLimitTotal = 0
+
+        PVLimit = np.zeros([nCounter1, 2])
+
+        for i in range(nCounter1):
+            if Dzwind_and_power_dfSel[i] == 4:
+                nWhichBin = 0
+                for m in range(base_wind_and_power_count - 1):
+                    if DzMarch809[i, 0] > base_wind_and_power_df.loc[m, self.rated_wind_speed] and DzMarch809[i, 0] <= \
+                            base_wind_and_power_df.loc[m + 1, self.rated_wind_speed]:
+                        nWhichBin = m
+                        break
+
+                # interpolate the design power at this wind speed
+                if nWhichBin > base_wind_and_power_count - 1 or nWhichBin == 0:
+                    continue
+
+                IdealPower = (DzMarch809[i, 0] - base_wind_and_power_df.loc[nWhichBin, self.rated_wind_speed]) / (
+                        base_wind_and_power_df.loc[nWhichBin + 1, self.rated_wind_speed] -
+                        base_wind_and_power_df.loc[nWhichBin, self.rated_wind_speed]) * (
+                                     base_wind_and_power_df.loc[nWhichBin + 1, self.rated_capacity] -
+                                     base_wind_and_power_df.loc[nWhichBin, self.rated_capacity]) + \
+                             base_wind_and_power_df.loc[nWhichBin, self.rated_capacity]
+                EPLost = np.abs(IdealPower - DzMarch809[i, 1]) / 6
+                EPLostLimitTotal = EPLostLimitTotal + EPLost
+
+                PVLimit[nLimitTotal, :] = DzMarch809[i, :]
+                nLimitTotal = nLimitTotal + 1
+
+        nLimitTotal = nLimitTotal - 1
+
+        print("nLimitTotal", nLimitTotal)
+
+        # combined under-production and curtailment loss
+        EPLostBadLimitTotal = EPLostBadTotal + EPLostLimitTotal
+
+        # if the power-curve shortfall loss is positive
+        if EPLostPerformTotal >= 0:
+            EPIdealTotal = EPActualTotal + EPLostStopTotal + EPLostLimitTotal + EPLostBadTotal + EPLostPerformTotal
+
+        # if the power-curve shortfall loss is negative
+        if EPLostPerformTotal < 0:
+            EPIdealTotal = EPActualTotal + EPLostStopTotal + EPLostLimitTotal + EPLostBadTotal
+
+        print("EPIdealTotal", EPIdealTotal)
+        # EPIdealTotal (from summation) can be compared with the theoretical EPIdealTotalAAA
+        # over-production to remove: (1) points left of the main band; (2) points above rated wind speed
+        RemoveOverEP = 0
+        nType2 = 0
+        for i in range(nCounter1):
+            if Dzwind_and_power_dfSel[i] == 2:  # over-production bad point left of the main band
+                nWhichBin = 0
+                for m in range(base_wind_and_power_count - 1):
+                    if base_wind_and_power_df.loc[m, self.rated_wind_speed] < DzMarch809[i, 0] <= base_wind_and_power_df.loc[m + 1, self.rated_wind_speed]:
+                        nWhichBin = m
+                        break
+
+                if nWhichBin > base_wind_and_power_count - 1 or nWhichBin == 0:
+                    continue
+
+                IdealPower = (DzMarch809[i, 0] - base_wind_and_power_df.loc[nWhichBin, self.rated_wind_speed]) / (
+                        base_wind_and_power_df.loc[nWhichBin + 1, self.rated_wind_speed] - base_wind_and_power_df.loc[
+                    nWhichBin, self.rated_wind_speed]) * (
+                                     base_wind_and_power_df.loc[nWhichBin + 1, self.rated_capacity] -
+                                     base_wind_and_power_df.loc[nWhichBin, self.rated_capacity]) + \
+                             base_wind_and_power_df.loc[nWhichBin, self.rated_capacity]
+
+                RemoveOverEP = RemoveOverEP + (DzMarch809[i, 1] - IdealPower) / 6
+                nType2 = nType2 + 1
+
+        print("RemoveOverEP", RemoveOverEP)
+        print("nType2", nType2)
+        # over-production points above rated power
+        nTypeOver = 0
+        for i in range(nCounter1):
+            if DzMarch809[i, 1] > PRated:
+                RemoveOverEP = RemoveOverEP + (DzMarch809[i, 1] - PRated) / 6
+                nTypeOver = nTypeOver + 1
+
+        print("RemoveOverEP", RemoveOverEP)
+        print("nTypeOver", nTypeOver)
+
+    def run(self):
+        # Implement your class identification logic here
+        self.identifier()
+
+
+if __name__ == '__main__':
+    test = ClassIdentifier('test', r"D:\中能智能\matlib计算相关\好点坏点matlib计算\A01.csv", index='时间',
+                           wind_velocity='风速',
+                           active_power='功率')
+
+    test.run()
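
The nested scan loops above place each sample into a 25 kW x 0.25 m/s grid cell one comparison at a time. A hedged numpy sketch of the same binning (an alternative illustration, not the author's code): np.digitize resolves all bin indices in one call and np.add.at scatter-counts them.

    import numpy as np

    power = np.array([120.0, 430.0, 980.0])            # kW
    speed = np.array([4.1, 7.8, 11.3])                 # m/s
    p_edges = np.arange(0.0, 1600.0 + 25.0, 25.0)      # 25 kW strips
    v_edges = np.arange(0.125, 25.0 + 0.125, 0.25)     # 0.25 m/s strips, centered as above
    p_idx = np.digitize(power, p_edges)
    v_idx = np.digitize(speed, v_edges)
    counts = np.zeros((len(p_edges) + 1, len(v_edges) + 1), dtype=int)
    np.add.at(counts, (p_idx, v_idx), 1)               # one increment per (power, speed) sample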

+ 196 - 0
tmp_file/baiyushan_20240906.py

@@ -0,0 +1,196 @@
+from multiprocessing import Pool
+from os import *
+
+import chardet
+import pandas as pd
+
+
+# detect a file's encoding
+def detect_file_encoding(filename):
+    # read the first 1000 bytes (enough for most encoding detection)
+    with open(filename, 'rb') as f:
+        rawdata = f.read(1000)
+    result = chardet.detect(rawdata)
+    encoding = result['encoding']
+
+    if encoding is None:
+        encoding = 'gb18030'
+
+    if encoding.lower() == 'gb2312' or encoding.lower().startswith("windows"):  # encoding is never None at this point
+        encoding = 'gb18030'
+    return encoding
+
+
+# read a data file into a DataFrame
+def read_file_to_df(file_path, read_cols=list(), header=0):
+    df = pd.DataFrame()
+    if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
+        encoding = detect_file_encoding(file_path)
+        end_with_gz = str(file_path).lower().endswith("gz")
+        if read_cols:
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header, on_bad_lines='warn')
+        else:
+
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, header=header, on_bad_lines='warn')
+
+    else:
+        xls = pd.ExcelFile(file_path)
+        # get every sheet name
+        sheet_names = xls.sheet_names
+        for sheet in sheet_names:
+            if read_cols:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header, usecols=read_cols)])
+            else:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header)])
+
+    return df
+
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # walk every entry in the directory; the parameter must not be named "path",
+    # which would shadow the os.path module pulled in by "from os import *"
+    for item in listdir(dir_path):
+        item_path = path.join(dir_path, item)
+        if path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif path.isfile(item_path):
+            if dir_path not in directory_dict:
+                directory_dict[dir_path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[dir_path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[dir_path].append(item_path)
+
+
+# collect every Excel/CSV file under a directory
+def read_excel_files(read_path):
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# create a directory path; the parameter is renamed so it does not shadow
+# the os.path module imported via "from os import *"
+def create_file_path(target_path, is_file_path=False):
+    if is_file_path:
+        target_path = path.dirname(target_path)
+
+    if not path.exists(target_path):
+        makedirs(target_path, exist_ok=True)
+
+
+def read_status(status_path):
+    all_files = read_excel_files(status_path)
+
+    with Pool(20) as pool:
+        dfs = pool.starmap(read_file_to_df, [(file, ['设备名称', '状态码', '开始时间'], 2) for file in all_files])
+
+    df = pd.concat(dfs)
+    df = df[df['状态码'].isin([3, 5])]
+    df['开始时间'] = pd.to_datetime(df['开始时间'])
+
+    df['处理后时间'] = (df['开始时间'] + pd.Timedelta(minutes=10)).apply(
+        lambda x: f"{x.year}-{str(x.month).zfill(2)}-{str(x.day).zfill(2)} {str(x.hour).zfill(2)}:{x.minute // 10}0:00")
+
+    df['处理后时间'] = pd.to_datetime(df['处理后时间'])
+    df = df[(df['处理后时间'] >= '2023-09-01 00:00:00')]
+    # clip only the time column; assigning without .loc would overwrite every field of the matching rows
+    df.loc[df['处理后时间'] >= '2024-09-01 00:00:00', '处理后时间'] = pd.Timestamp('2024-09-01 00:00:00')
+    df.sort_values(by=['设备名称', '处理后时间'], inplace=True)
+
+    return df
+
+
+def read_fault_data(fault_path):
+    all_files = read_excel_files(fault_path)
+
+    with Pool(20) as pool:
+        dfs = pool.starmap(read_file_to_df, [(file, ['设备名称', '故障开始时间'], 2) for file in all_files])
+
+    df = pd.concat(dfs)
+    df = df[df['设备名称'].str.startswith("#")]
+    df['故障开始时间'] = pd.to_datetime(df['故障开始时间'])
+
+    df['处理后故障开始时间'] = (df['故障开始时间'] + pd.Timedelta(minutes=10)).apply(
+        lambda x: f"{x.year}-{str(x.month).zfill(2)}-{str(x.day).zfill(2)} {str(x.hour).zfill(2)}:{x.minute // 10}0:00")
+
+    df['处理后故障开始时间'] = pd.to_datetime(df['处理后故障开始时间'])
+    df = df[(df['处理后故障开始时间'] >= '2023-09-01 00:00:00') & (df['处理后故障开始时间'] < '2024-09-01 00:00:00')]
+    df.sort_values(by=['设备名称', '处理后故障开始时间'], inplace=True)
+
+    return df
+
+
+def read_10min_data(data_path):
+    all_files = read_excel_files(data_path)
+
+    with Pool(20) as pool:
+        dfs = pool.starmap(read_file_to_df,
+                           [(file, ['设备名称', '时间', '平均风速(m/s)', '平均网侧有功功率(kW)'], 1) for file in all_files])
+
+    df = pd.concat(dfs)
+    df['时间'] = pd.to_datetime(df['时间'])
+
+    df = df[(df['时间'] >= '2023-09-01 00:00:00') & (df['时间'] < '2024-09-01 00:00:00')]
+    df.sort_values(by=['设备名称', '时间'], inplace=True)
+    return df
+
+
+def select_data_and_save(name, fault_df, origin_df):
+    df = pd.DataFrame()
+    for i in range(fault_df.shape[0]):
+        fault = fault_df.iloc[i]
+        con1 = origin_df['时间'] >= fault['处理后故障开始时间']
+        con2 = origin_df['时间'] <= fault['结束时间']
+        df = pd.concat([df, origin_df[con1 & con2]])
+
+    name = name.replace('#', 'F')
+    df.drop_duplicates(inplace=True)
+    df.to_csv(save_path + sep + name + '.csv', index=False, encoding='utf8')
+
+
+if __name__ == '__main__':
+    base_path = r'/data/download/白玉山/需要整理的数据'
+    save_path = base_path + sep + 'sele_data_202409261135'
+    create_file_path(save_path)
+    status_df = read_status(base_path + sep + '设备状态')
+    fault_df = read_fault_data(base_path + sep + '故障')
+    data_df = read_10min_data(base_path + sep + '十分钟')
+
+    status_df.to_csv(base_path + sep + '设备状态' + '.csv', index=False, encoding='utf8')
+    fault_df.to_csv(base_path + sep + '故障' + '.csv', index=False, encoding='utf8')
+    data_df.to_csv(base_path + sep + '十分钟' + '.csv', index=False, encoding='utf8')
+
+    print(status_df.shape)
+    print(fault_df.shape)
+    print(data_df.shape)
+
+    fault_list = list()
+    for i in range(fault_df.shape[0]):
+        data = fault_df.iloc[i]
+        con1 = status_df['设备名称'] == data['设备名称']
+        con2 = status_df['处理后时间'] >= data['处理后故障开始时间']
+        fault_list.append(status_df[con1 & con2]['处理后时间'].min())
+    fault_df['结束时间'] = fault_list
+
+    status_df.to_csv(base_path + sep + '设备状态' + '.csv', index=False, encoding='utf8')
+    fault_df.to_csv(base_path + sep + '故障' + '.csv', index=False, encoding='utf8')
+    data_df.to_csv(base_path + sep + '十分钟' + '.csv', index=False, encoding='utf8')
+
+    names = set(fault_df['设备名称'])
+    fault_map = dict()
+    data_map = dict()
+    for name in names:
+        fault_map[name] = fault_df[fault_df['设备名称'] == name]
+        data_map[name] = data_df[data_df['设备名称'] == name]
+
+    with Pool(20) as pool:
+        pool.starmap(select_data_and_save, [(name, fault_map[name], data_map[name]) for name in names])
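
The f-string lambdas above round a timestamp up to its 10-minute slot by hand. A vectorized sketch of the equivalent operation (an alternative, not the file's code):

    import pandas as pd

    ts = pd.Series(pd.to_datetime(["2024-09-06 10:03:27"]))
    rounded = (ts + pd.Timedelta(minutes=10)).dt.floor("10min")
    print(rounded.iloc[0])  # 2024-09-06 10:10:00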

+ 48 - 0
tmp_file/changing_hebing_guzhang.py

@@ -0,0 +1,48 @@
+import copy
+import datetime
+
+import pandas as pd
+
+read_path = r'D:\data\长清\故障记录_20230420_20240419.csv'
+
+df = pd.read_csv(read_path, encoding='gb18030')
+
+df['风机名'] = df['风机名'].apply(lambda wind_name: 'A' + wind_name.replace('号风机', '').zfill(2))
+
+df = df[~df['状态码描述'].isin(['高偏航误差穿越', '手动偏航'])]
+
+df['激活时间'] = pd.to_datetime(df['激活时间'].apply(lambda x: x[0:x.rfind(":")]), errors='coerce')
+df['复位时间'] = pd.to_datetime(df['复位时间'].apply(lambda x: x[0:x.rfind(":")]), errors='coerce')
+
+df.dropna(subset=['激活时间', '复位时间'], inplace=True)
+
+
+def generate_next_10_min(dt):
+    minute = dt.minute
+    chazhi = 10 - int(minute % 10)
+    now = dt + datetime.timedelta(minutes=chazhi)
+    now = now.replace(second=0, microsecond=0)
+
+    return now
+
+
+df['begin_time'] = df['激活时间'].apply(generate_next_10_min)
+df['end_time'] = df['复位时间'].apply(generate_next_10_min)
+
+df['chazhi_count'] = ((df['end_time'] - df['begin_time']).dt.total_seconds() // 600).astype(int) + 1  # total_seconds, so multi-day faults are not truncated
+
+result_df = df[df['chazhi_count'] == 1]
+
+datas = []  # start empty; seeding with [[]] would inject an all-NaN row
+begin_time_col = df.columns.get_loc('begin_time')  # positional index of begin_time in row.values
+for index, row in df[df['chazhi_count'] > 1].iterrows():
+    for i in range(row['chazhi_count']):
+        data = copy.deepcopy(row.values)
+        data[begin_time_col] = data[begin_time_col] + datetime.timedelta(minutes=10 * i)
+        datas.append(data)
+
+now_df = pd.DataFrame(datas, columns=df.columns)
+result_df = pd.concat([result_df, now_df])
+
+result_df.reset_index(inplace=True, drop=True)
+result_df.sort_values(by=['风机名', '激活时间', 'begin_time'], inplace=True)
+result_df.to_csv("故障记录.csv", encoding='utf8')
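
The chazhi_count loop above duplicates a fault row once per 10-minute slot it spans. pd.date_range yields those slots directly; a sketch under the same begin/end convention:

    import pandas as pd

    begin = pd.Timestamp("2024-04-01 10:10:00")
    end = pd.Timestamp("2024-04-01 10:40:00")
    slots = pd.date_range(begin, end, freq="10min")
    print(len(slots))  # 4, matching chazhi_count = 1800 // 600 + 1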

+ 95 - 0
tmp_file/cp_file.py

@@ -0,0 +1,95 @@
+import datetime
+import multiprocessing
+import os
+import shutil
+
+not_move_dir = ["乌梅山风电场-江西-大唐",
+                "诺木洪风电场-甘肃-华电",
+                "平陆风电场-山西-中广核",
+                "泗洪协合风电场-安徽-深能南控",
+                "诺木洪风电场-青海-华电",
+                "长清风电场-山东-国电"
+                ]
+
+read_dir = r"/data/download/collection_data"
+# read_dir = r'Z:\collection_data'
+save_base_dir = r"/data/download/datang_shangxian"
+
+
+def __build_directory_dict(directory_dict, path, filter_types=None):
+    # walk every entry in the directory
+    for item in os.listdir(path):
+        if item not in not_move_dir:
+            item_path = os.path.join(path, item)
+            if os.path.isdir(item_path):
+                __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+            elif os.path.isfile(item_path):
+                if path not in directory_dict:
+                    directory_dict[path] = []
+
+                if filter_types is None or len(filter_types) == 0:
+                    directory_dict[path].append(item_path)
+                elif str(item_path).split(".")[-1] in filter_types:
+                    if str(item_path).count("~$") == 0:
+                        directory_dict[path].append(item_path)
+
+
+# collect every Excel/CSV file under a directory
+def read_excel_files(read_path):
+    if os.path.isfile(read_path):
+        return [read_path]
+
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# collect every file under a directory
+def read_files(read_path):
+    if os.path.isfile(read_path):
+        return [read_path]
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz', 'zip', 'rar'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# create a directory path
+def create_file_path(path, is_file_path=False):
+    """
+    Create a directory path.
+    :param path: the directory to create
+    :param is_file_path: whether the given path ends with a file name
+    """
+    if is_file_path:
+        path = os.path.dirname(path)
+
+    if not os.path.exists(path):
+        os.makedirs(path, exist_ok=True)
+
+
+def copy_to_new(from_path):
+    to_path = from_path.replace(read_dir, save_base_dir)
+    is_file = False
+    if to_path.count('.') > 0:
+        is_file = True
+
+    create_file_path(to_path, is_file_path=is_file)
+
+    shutil.copy(from_path, to_path)
+
+
+print("开始:", datetime.datetime.now())
+begin = datetime.datetime.now()
+read_all_files = [i for i in read_files(read_dir) if i.find("收资数据") > -1]
+print(len(read_all_files))
+print("统计耗时:", datetime.datetime.now() - begin)
+cp_begin = datetime.datetime.now()
+
+with multiprocessing.Pool(40) as pool:
+    pool.starmap(copy_to_new, [(path,) for path in read_all_files])
+
+print(len(read_all_files), "耗时:", datetime.datetime.now() - cp_begin, "总耗时:", datetime.datetime.now() - begin)
+print("结束:", datetime.datetime.now())
+
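
The recursion in __build_directory_dict plus the "收资数据" substring filter can also be expressed with os.walk, which prunes skipped directories in place; a sketch (assuming not_move_dir from this file is in scope), not a drop-in replacement:

    import os

    def iter_wanted_files(root, keyword="收资数据"):
        for dirpath, dirnames, filenames in os.walk(root):
            # editing dirnames in place stops os.walk from descending into them
            dirnames[:] = [d for d in dirnames if d not in not_move_dir]
            for name in filenames:
                full_path = os.path.join(dirpath, name)
                if keyword in full_path:
                    yield full_path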

+ 94 - 0
tmp_file/cp_online_data_to_other.py

@@ -0,0 +1,94 @@
+import datetime
+import multiprocessing
+import shutil
+from os import *
+
+not_move_dir = ["乌梅山风电场-江西-大唐",
+                "诺木洪风电场-甘肃-华电",
+                "平陆风电场-山西-中广核",
+                "泗洪协合风电场-安徽-深能南控",
+                "诺木洪风电场-青海-华电",
+                "长清风电场-山东-国电"
+                ]
+
+read_dir = r"/data/download/collection_data"
+# read_dir = r'Z:\collection_data'
+save_base_dir = r"/data/download/datang_shangxian"
+
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # walk every entry in the directory; the parameter must not be named "path",
+    # which would shadow the os.path module pulled in by "from os import *"
+    for item in listdir(dir_path):
+        if item not in not_move_dir:
+            item_path = path.join(dir_path, item)
+            if path.isdir(item_path):
+                __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+            elif path.isfile(item_path):
+                if dir_path not in directory_dict:
+                    directory_dict[dir_path] = []
+
+                if filter_types is None or len(filter_types) == 0:
+                    directory_dict[dir_path].append(item_path)
+                elif str(item_path).split(".")[-1] in filter_types:
+                    if str(item_path).count("~$") == 0:
+                        directory_dict[dir_path].append(item_path)
+
+
+# collect every Excel/CSV file under a directory
+def read_excel_files(read_path):
+    if path.isfile(read_path):
+        return [read_path]
+
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# collect every file under a directory
+def read_files(read_path):
+    if path.isfile(read_path):
+        return [read_path]
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz', 'zip', 'rar'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# create a directory path; the parameter is renamed so it does not shadow
+# the os.path module imported via "from os import *"
+def create_file_path(target_path, is_file_path=False):
+    """
+    Create a directory path.
+    :param target_path: directory to create
+    :param is_file_path: whether the given path ends with a file name
+    """
+    if is_file_path:
+        target_path = path.dirname(target_path)
+
+    if not path.exists(target_path):
+        makedirs(target_path, exist_ok=True)
+
+
+def copy_to_new(from_path):
+    to_path = from_path.replace(read_dir, save_base_dir)
+    is_file = False
+    if to_path.count('.') > 0:
+        is_file = True
+
+    create_file_path(to_path, is_file_path=is_file)
+
+    shutil.copy(from_path, to_path)
+
+
+print("开始:", datetime.datetime.now())
+begin = datetime.datetime.now()
+read_all_files = [i for i in read_files(read_dir) if i.find("收资数据") > -1]
+print(len(read_all_files))
+print("统计耗时:", datetime.datetime.now() - begin)
+cp_begin = datetime.datetime.now()
+
+with multiprocessing.Pool(40) as pool:
+    pool.starmap(copy_to_new, [(path,) for path in read_all_files])
+
+print(len(read_all_files), "耗时:", datetime.datetime.now() - cp_begin, "总耗时:", datetime.datetime.now() - begin)
+print("结束:", datetime.datetime.now())

+ 47 - 0
tmp_file/curge_read.py

@@ -0,0 +1,47 @@
+import os
+
+import chardet
+import pandas as pd
+
+
+# detect a file's encoding
+def detect_file_encoding(filename):
+    # read the first 1000 bytes (enough for most encoding detection)
+    with open(filename, 'rb') as f:
+        rawdata = f.read(1000)
+    result = chardet.detect(rawdata)
+    encoding = result['encoding']
+
+    if encoding is None:
+        encoding = 'gb18030'
+
+    if encoding.lower() in ['utf-8', 'ascii', 'utf8', 'utf-8-sig']:
+        return 'utf-8'
+
+    return 'gb18030'
+
+
+def read_file_to_df(file_path, nrows=None):
+    df = pd.DataFrame()
+    try:
+        if str(file_path).lower().endswith("csv"):
+            encoding = detect_file_encoding(file_path)
+            df = pd.read_csv(file_path, encoding=encoding, on_bad_lines='warn', nrows=nrows)
+        else:
+            xls = pd.ExcelFile(file_path)
+            sheet_names = xls.sheet_names
+            for sheet_name in sheet_names:
+                now_df = pd.read_excel(xls, sheet_name=sheet_name, nrows=nrows)
+                now_df['sheet_name'] = sheet_name
+                df = pd.concat([df, now_df])
+            xls.close()
+    except Exception as e:
+        message = '文件:' + os.path.basename(file_path) + ',' + str(e)
+        raise ValueError(message)
+
+    return df
+
+
+if __name__ == '__main__':
+    df = read_file_to_df(r"D:\data\11-12月.xls")
+    print(df)
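
detect_file_encoding above collapses everything that is not a UTF-8 flavor to gb18030. A hedged variant that also consults chardet's confidence score (the 0.7 cutoff is an assumption, not something this file specifies):

    import chardet

    def sniff_encoding(file_path, default="gb18030", min_confidence=0.7):
        with open(file_path, "rb") as f:
            result = chardet.detect(f.read(4096))
        if result["encoding"] and result["confidence"] >= min_confidence:
            return result["encoding"]
        return default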

+ 40 - 0
tmp_file/error_ms_data.py

@@ -0,0 +1,40 @@
+from datetime import datetime
+
+import pandas as pd
+
+
+def convert_date(date_str):
+    cut_index = str(date_str).rfind("_")
+    date = date_str[0:cut_index].replace("_", "-")
+    time = date_str[cut_index + 1:].replace(":", ".")
+
+    return datetime.strptime(f"{date} {time}", '%Y-%m-%d %H.%M.%S.%f')
+
+
+df = pd.read_csv(r"d:/data/b2_240828_2324_Err 1.csv", header=1)
+df.dropna(subset='TimeStamp', inplace=True)
+df.drop_duplicates(subset='TimeStamp', keep="first", inplace=True)
+
+origin_columns = list(df.columns)
+
+df['TimeStamp1'] = df['TimeStamp'].apply(convert_date)
+df.sort_values(by='TimeStamp1', inplace=True)
+
+# df['DateTime'] = pd.to_datetime(df['TimeStamp'], format="%Y-%m-%d %H:%M:%S")
+df['DateTime'] = df['TimeStamp1'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
+
+print(df.shape)
+
+dateTime_count = df['DateTime'].value_counts()
+
+dateTime_count_1 = dateTime_count[dateTime_count == 1]
+dateTime_count_gt1 = dateTime_count[dateTime_count > 1]
+
+df1 = df[df['DateTime'].isin(dateTime_count_1.index.values)]
+df2 = df[df['DateTime'].isin(dateTime_count_gt1.index.values)]
+
+print(df1.shape)
+print(df2.shape)
+origin_columns.insert(0, 'DateTime')
+df1.to_csv("1秒数据.csv", encoding='utf-8', index=False, columns=origin_columns, date_format="%Y-%m-%d %H:%M:%S.%f")
+df2.to_csv("毫秒数据.csv", encoding='utf-8', index=False, columns=origin_columns, date_format="%Y-%m-%d %H:%M:%S.%f")
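
A usage sketch for convert_date; the underscore-separated TimeStamp layout is inferred from the parsing code, not documented anywhere in the source:

    print(convert_date("2024_08_28_23:24:01.123"))
    # 2024-08-28 23:24:01.123000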

+ 120 - 0
tmp_file/extrace_month_data.py

@@ -0,0 +1,120 @@
+import multiprocessing
+import os
+from datetime import datetime
+
+import chardet
+import pandas as pd
+
+pd.options.mode.copy_on_write = True
+
+
+# detect a file's encoding
+def detect_file_encoding(filename):
+    # read the first 1000 bytes (enough for most encoding detection)
+    with open(filename, 'rb') as f:
+        rawdata = f.read(1000)
+    result = chardet.detect(rawdata)
+    encoding = result['encoding']
+
+    if encoding is None:
+        encoding = 'gb18030'
+
+    if encoding.lower() == 'gb2312' or encoding.lower().startswith("windows"):  # encoding is never None at this point
+        encoding = 'gb18030'
+    return encoding
+
+
+# read a data file into a DataFrame
+def read_file_to_df(file_path, read_cols=list(), header=0):
+    df = pd.DataFrame()
+    if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
+        encoding = detect_file_encoding(file_path)
+        end_with_gz = str(file_path).lower().endswith("gz")
+        if read_cols:
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header, on_bad_lines='warn')
+        else:
+
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, header=header, on_bad_lines='warn')
+
+    else:
+        xls = pd.ExcelFile(file_path)
+        # get every sheet name
+        sheet_names = xls.sheet_names
+        for sheet in sheet_names:
+            if read_cols:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header, usecols=read_cols)])
+            else:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header)])
+
+    return df
+
+
+def __build_directory_dict(directory_dict, path, filter_types=None):
+    # walk every entry in the directory
+    for item in os.listdir(path):
+        item_path = os.path.join(path, item)
+        if os.path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif os.path.isfile(item_path):
+            if path not in directory_dict:
+                directory_dict[path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[path].append(item_path)
+
+
+def read_excel_files(read_path):
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+def read_and_save(file_path, read_dir, save_dir, time_col, time_during, select_cols_str):
+    base_name = os.path.basename(file_path)
+    print(f'{datetime.now()}: {base_name} 开始执行')
+    begin = datetime.now()
+    select_cols = list()
+    if select_cols_str:
+        select_cols = [i for i in select_cols_str.split(",") if i]
+
+    if select_cols:
+        if time_col not in select_cols:
+            select_cols.insert(0, time_col)
+        df = read_file_to_df(file_path, read_cols=select_cols)
+    else:
+        df = read_file_to_df(file_path)
+
+    print(f'{base_name} : {df.shape}')
+    df[time_col] = pd.to_datetime(df[time_col], errors='coerce')
+    df = df[(df[time_col] >= datetime.strptime(time_during.split(",")[0], '%Y-%m-%d %H:%M:%S')) &
+            (df[time_col] < datetime.strptime(time_during.split(",")[1], '%Y-%m-%d %H:%M:%S'))]
+
+    save_path = file_path.replace(read_dir, save_dir)
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
+    df.to_csv(save_path, index=False, encoding='utf-8')
+    print(f"{datetime.now()}: 执行{os.path.basename(file_path)}结束,耗时{datetime.now() - begin}")
+
+
+if __name__ == '__main__':
+    read_dir = r'/data/download/collection_data/1进行中/新艾里风电场-吉林-大唐/清理数据/WOF043600007-WOB000004_XAL1219秒级/second'
+    save_dir = r'/data/download/collection_data/1进行中/新艾里风电场-吉林-大唐/清理数据/WOF043600007-WOB000004_XAL1219秒级/8-9-10-month-data'
+    time_col = 'time_stamp'
+    time_during = '2024-08-01 00:00:00,2024-11-01 00:00:00'
+    select_cols_str = 'time_stamp,active_power,wind_velocity,pitch_angle_blade_1,yaw_error1'
+
+    all_excel_files = read_excel_files(read_dir)
+
+    with multiprocessing.Pool(processes=10) as pool:
+        pool.starmap(read_and_save,
+                     [(file_path, read_dir, save_dir, time_col, time_during, select_cols_str) for file_path in
+                      all_excel_files])
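
The pair of comparisons in read_and_save implements a half-open [start, end) window. Series.between expresses the same filter more compactly (string inclusive= needs pandas >= 1.3); a self-contained sketch:

    import pandas as pd

    df = pd.DataFrame({"time_stamp": pd.to_datetime(["2024-07-31 23:50", "2024-08-01 00:00"])})
    start, end = "2024-08-01 00:00:00", "2024-11-01 00:00:00"
    df = df[df["time_stamp"].between(start, end, inclusive="left")]  # keeps only the second row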

+ 57 - 0
tmp_file/fengxiang_fengdianchang.py

@@ -0,0 +1,57 @@
+import sys
+from multiprocessing import Pool
+from os import path
+# do not rebind the name "path": read_and_save_file below still needs os.path
+project_root = path.dirname(path.dirname(path.abspath(__file__)))
+print(project_root)
+sys.path.insert(0, project_root)
+print(sys.path)
+
+from utils.file.trans_methods import *
+from utils.systeminfo.sysinfo import use_files_get_max_cpu_count
+
+
+def read_and_save_file(filename):
+    try:
+        basename = path.basename(filename)
+        wind_number = basename.split("_")[0]
+        df = read_file_to_df(filename, header=1)
+        df['风机号'] = wind_number
+        df['描述'] = pd.to_datetime(df['描述'], format='%d-%m-%Y %H:%M:%S')
+        df.set_index(keys=['描述', '风机号'], inplace=True)
+        return wind_number, df
+    except Exception as e:
+        print(filename, 'error')  # filename is always bound here, unlike basename
+        raise e
+
+
+if __name__ == '__main__':
+    read_path = r'/data/download/collection_data/1进行中/枫香风电场-贵州-大唐/收资数据/枫香风电场收资表/1.10分钟SCADA数据'
+    save_path = r'/data/download/collection_data/1进行中/枫香风电场-贵州-大唐/清理数据/枫香风电场收资表/1.10分钟SCADA数据'
+    # read_path = r'D:\trans_data\枫香\收资数据\min'
+    # save_path = r'D:\trans_data\枫香\清理数据\min'
+    create_file_path(save_path, False)
+    all_fils = read_excel_files(read_path)
+    process_count = use_files_get_max_cpu_count(all_fils)
+
+    with Pool(process_count) as pool:
+        results = pool.starmap(read_and_save_file, [(i,) for i in all_fils])
+
+    df_dict = dict()
+    for result in results:
+        wind_number, df = result
+        cols = list(df.columns)
+        cols.sort()
+        cols_str = '-'.join(cols)
+        if wind_number in df_dict.keys():
+            if cols_str in df_dict[wind_number].keys():
+                df_dict[wind_number][cols_str] = pd.concat([df_dict[wind_number][cols_str], df], axis=0)
+            else:
+                df_dict[wind_number][cols_str] = df
+        else:
+            df_dict[wind_number] = {cols_str: df}
+
+    for wind_number, cols_dict in df_dict.items():
+        df = pd.concat(cols_dict.values(), axis=1)
+        df.sort_index(inplace=True)
+        df.reset_index(inplace=True)
+        df.to_csv(path.join(save_path, f"{wind_number}.csv"), encoding="utf-8", index=False)
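The nested dictionary bookkeeping for df_dict can be expressed more directly with collections.defaultdict; a sketch of the same grouping logic, assuming the results list produced by the pool above:

    from collections import defaultdict

    df_dict = defaultdict(dict)
    for wind_number, df in results:
        cols_str = '-'.join(sorted(df.columns))
        group = df_dict[wind_number]
        group[cols_str] = pd.concat([group[cols_str], df]) if cols_str in group else df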

+ 48 - 0
tmp_file/filter_lose_data.py

@@ -0,0 +1,48 @@
+import datetime
+
+import pandas as pd
+
+df = pd.read_csv("D:\data\白玉山后评估数据资料\十分钟.csv", encoding='utf8')
+
+df['时间'] = pd.to_datetime(df['时间'])
+df['plus_10min'] = df['时间'] + pd.Timedelta(minutes=10)
+
+names = set(df['设备名称'])
+
+
+def get_time_space_count(start_time: datetime.datetime, end_time: datetime.datetime, time_space=1):
+    """
+    获取俩个时间之间的个数
+    :return: 查询时间间隔
+    """
+    delta = end_time - start_time
+    total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
+
+    return abs(int(total_seconds / time_space))
+
+
+result_dict = dict()
+for name in names:
+    q_df = df[df['设备名称'] == name].copy()
+    q_df['unshift'] = q_df['时间'].shift(-1)
+    # Fill the last row's missing successor with a Timestamp (a plain string would
+    # turn the column into mixed object dtype and break the subtraction below)
+    q_df['unshift'] = q_df['unshift'].fillna(pd.Timestamp('2024-09-01 00:00:00'))
+    result_df = q_df[~(q_df['plus_10min'] == q_df['unshift'])]
+    result_df.reset_index(inplace=True)
+    q_list = list()
+    count = 0
+    result_df.to_csv('test.csv', encoding='utf8')  # debug dump; overwritten for every device name
+    for i in range(result_df.shape[0]):
+        data = result_df.iloc[i]
+        begin = data['时间']
+        end = data['unshift']
+        count = count + get_time_space_count(begin, end, 600) - 1
+        # if end is not None and end != np.nan:
+        #     q_list.append(f"{begin} ~ {end}")
+
+    result_dict[name] = count
+
+with open("缺失_数量.csv", 'w', encoding='utf8') as f:
+    for k, v in result_dict.items():
+        # v.insert(0, k)
+        # f.write(",".join(v) + "\n")
+        f.write(f"{k},{v}\n")

+ 205 - 0
tmp_file/gradio_web.py

@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2024/6/3
+# @Author  : 魏志亮
+import copy
+
+import gradio as gr
+import yaml
+
+from service.plt_service import get_all_wind_company
+from service.trans_service import get_min_sec_conf_test
+
+
+# from utils.db.trans_mysql import *
+
+
+def test_click(wind_name, wind_full_name, type, is_vertical_table, merge_columns, vertical_read_cols,
+               vertical_index_cols, vertical_col_key, vertical_col_value, resolve_col_prefix, wind_name_exec,
+               wind_turbine_number, time_stamp, active_power, rotor_speed, generator_speed, wind_velocity,
+               pitch_angle_blade_1, pitch_angle_blade_2, pitch_angle_blade_3, cabin_position, true_wind_direction,
+               yaw_error1, set_value_of_active_power, gearbox_oil_temperature, generatordrive_end_bearing_temperature,
+               generatornon_drive_end_bearing_temperature, wind_turbine_status, wind_turbine_status2, cabin_temperature,
+               twisted_cable_angle, front_back_vibration_of_the_cabin, side_to_side_vibration_of_the_cabin,
+               actual_torque, given_torque, clockwise_yaw_count, counterclockwise_yaw_count, unusable,
+               power_curve_available, required_gearbox_speed, inverter_speed_master_control, outside_cabin_temperature,
+               main_bearing_temperature, gearbox_high_speed_shaft_bearing_temperature,
+               gearboxmedium_speed_shaftbearing_temperature, gearbox_low_speed_shaft_bearing_temperature,
+               generator_winding1_temperature, generator_winding2_temperature, generator_winding3_temperature,
+               turbulence_intensity, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10
+               ):
+    params = copy.deepcopy(vars())
+
+    error_message = ""
+    if wind_name is None or wind_name.strip() == '':
+        error_message += "风机名称必选"
+        gr.Warning(error_message)
+        return error_message
+
+    if wind_full_name is None or wind_full_name.strip() == '':
+        error_message += "风机全称必选"
+        gr.Warning(error_message)
+        return error_message
+
+    # save_to_trans_conf(params)
+    return yaml.dump(vars(), allow_unicode=True, sort_keys=False)
+
+
+def fill_data(wind_name, type):
+    select_cols = ['wind_full_name', 'is_vertical_table', 'merge_columns', 'vertical_read_cols',
+                   'vertical_index_cols', 'vertical_col_key', 'vertical_col_value', 'resolve_col_prefix',
+                   'wind_name_exec',
+                   'wind_turbine_number', 'time_stamp', 'active_power', 'rotor_speed', 'generator_speed',
+                   'wind_velocity', 'pitch_angle_blade_1', 'pitch_angle_blade_2', 'pitch_angle_blade_3',
+                   'cabin_position', 'true_wind_direction', 'yaw_error1', 'set_value_of_active_power',
+                   'gearbox_oil_temperature', 'generatordrive_end_bearing_temperature',
+                   'generatornon_drive_end_bearing_temperature', 'wind_turbine_status', 'wind_turbine_status2',
+                   'cabin_temperature', 'twisted_cable_angle', 'front_back_vibration_of_the_cabin',
+                   'side_to_side_vibration_of_the_cabin', 'actual_torque', 'given_torque', 'clockwise_yaw_count',
+                   'counterclockwise_yaw_count', 'unusable', 'power_curve_available', 'required_gearbox_speed',
+                   'inverter_speed_master_control', 'outside_cabin_temperature', 'main_bearing_temperature',
+                   'gearbox_high_speed_shaft_bearing_temperature', 'gearboxmedium_speed_shaftbearing_temperature',
+                   'gearbox_low_speed_shaft_bearing_temperature', 'generator_winding1_temperature',
+                   'generator_winding2_temperature', 'generator_winding3_temperature', 'turbulence_intensity', 'param1',
+                   'param2', 'param3', 'param4', 'param5', 'param6', 'param7', 'param8', 'param9', 'param10']
+    print(wind_name, type)
+    df = get_min_sec_conf_test(wind_name, type)
+    print(df)
+    if df is None or len(df.keys()) == 0:
+        return [''] * len(select_cols)
+    # One value per output component, in select_cols order
+    return tuple(df[col] for col in select_cols)
+
+
+with gr.Blocks(css=".container.svelte-1sk0pyu.svelte-1sk0pyu {width: 300px}", title='中能智能') as demo:
+    wind_name = gr.Dropdown(label="电场名称", choices=get_all_wind_company())
+
+    types = {
+        '分钟映射': 'minute', '秒映射': 'second'
+    }
+
+    for name in types.keys():
+        with gr.Tab(label=name):
+            type = gr.Text(label="映射类型", value=types[name], visible=False)
+            wind_full_name = gr.Textbox(label="完整的电场名称")
+            merge_columns = gr.Checkbox(label="是否需合并(多个excel列合并成一个才需要选择)", value=False)
+            is_vertical_table = gr.Checkbox(label="是否是竖表", value=False)
+            vertical_read_cols = gr.Textbox(label="竖表--读取的字段", placeholder="逗号分隔")
+            vertical_index_cols = gr.Textbox(label="竖表--分组的字段", placeholder="逗号分隔,一般都是时间,机组")
+            vertical_col_key = gr.Textbox(label="竖表--数据点字段")
+            vertical_col_value = gr.Textbox(label="竖表--数据点数值")
+            resolve_col_prefix = gr.Textbox(label="处理列名",
+                                            placeholder="比如重庆海装 25_#桨距角,只需要 桨距角 可以用 column[column.find('#')+1:]")
+
+            wind_name_exec = gr.Textbox(label="风机编号代码处理",
+                                        placeholder="比如 昌平001号风机,可以配置 wind_name.replace('昌平','').replace('号风机','')")
+
+            wind_turbine_number = gr.Textbox(label="风机编号(wind_turbine_number)")
+            time_stamp = gr.Textbox(label="时间戳(time_stamp)")
+            active_power = gr.Textbox(label="有功功率(active_power)")
+            rotor_speed = gr.Textbox(label="风轮转速(rotor_speed)")
+            generator_speed = gr.Textbox(label="发电机转速(generator_speed)")
+            wind_velocity = gr.Textbox(label="风速(wind_velocity)")
+            pitch_angle_blade_1 = gr.Textbox(label="桨距角1(pitch_angle_blade_1)")
+            pitch_angle_blade_2 = gr.Textbox(label="桨距角2(pitch_angle_blade_2)")
+            pitch_angle_blade_3 = gr.Textbox(label="桨距角3(pitch_angle_blade_3)")
+            cabin_position = gr.Textbox(label="机舱位置(cabin_position)")
+            true_wind_direction = gr.Textbox(label="绝对风向(true_wind_direction)")
+            yaw_error1 = gr.Textbox(label="对风角度(yaw_error1)")
+            set_value_of_active_power = gr.Textbox(label="有功功率设定值(set_value_of_active_power)")
+            gearbox_oil_temperature = gr.Textbox(label="齿轮箱油温(gearbox_oil_temperature)")
+            generatordrive_end_bearing_temperature = gr.Textbox(
+                label="发电机驱动端轴承温度(generatordrive_end_bearing_temperature)")
+            generatornon_drive_end_bearing_temperature = gr.Textbox(
+                label="发电机非驱动端轴承温度(generatornon_drive_end_bearing_temperature)")
+            wind_turbine_status = gr.Textbox(label="风机状态1(wind_turbine_status)")
+            wind_turbine_status2 = gr.Textbox(label="风机状态2(wind_turbine_status2)")
+            cabin_temperature = gr.Textbox(label="机舱内温度(cabin_temperature)")
+            twisted_cable_angle = gr.Textbox(label="扭缆角度(twisted_cable_angle)")
+            front_back_vibration_of_the_cabin = gr.Textbox(label="机舱前后振动(front_back_vibration_of_the_cabin)")
+            side_to_side_vibration_of_the_cabin = gr.Textbox(label="机舱左右振动(side_to_side_vibration_of_the_cabin)")
+            actual_torque = gr.Textbox(label="实际力矩(actual_torque)")
+            given_torque = gr.Textbox(label="给定力矩(given_torque)")
+            clockwise_yaw_count = gr.Textbox(label="顺时针偏航次数(clockwise_yaw_count)")
+            counterclockwise_yaw_count = gr.Textbox(label="逆时针偏航次数(counterclockwise_yaw_count)")
+            unusable = gr.Textbox(label="不可利用(unusable)")
+            power_curve_available = gr.Textbox(label="功率曲线可用(power_curve_available)")
+            required_gearbox_speed = gr.Textbox(label="齿轮箱转速(required_gearbox_speed)")
+            inverter_speed_master_control = gr.Textbox(label="变频器转速(主控)(inverter_speed_master_control)")
+            outside_cabin_temperature = gr.Textbox(label="环境温度(outside_cabin_temperature)")
+            main_bearing_temperature = gr.Textbox(label="主轴承轴承温度(main_bearing_temperature)")
+            gearbox_high_speed_shaft_bearing_temperature = gr.Textbox(
+                label="齿轮箱高速轴轴承温度(gearbox_high_speed_shaft_bearing_temperature)")
+            gearboxmedium_speed_shaftbearing_temperature = gr.Textbox(
+                label="齿轮箱中速轴轴承温度(gearboxmedium_speed_shaftbearing_temperature)")
+            gearbox_low_speed_shaft_bearing_temperature = gr.Textbox(
+                label="齿轮箱低速轴轴承温度(gearbox_low_speed_shaft_bearing_temperature)")
+            generator_winding1_temperature = gr.Textbox(label="发电机绕组1温度(generator_winding1_temperature)")
+            generator_winding2_temperature = gr.Textbox(label="发电机绕组2温度(generator_winding2_temperature)")
+            generator_winding3_temperature = gr.Textbox(label="发电机绕组3温度(generator_winding3_temperature)")
+            turbulence_intensity = gr.Textbox(label="湍流强度(turbulence_intensity)")
+            param1 = gr.Textbox(label="齿轮箱油压(param1)")
+            param2 = gr.Textbox(label="预留字段2(param2)")
+            param3 = gr.Textbox(label="预留字段3(param3)")
+            param4 = gr.Textbox(label="预留字段4(param4)")
+            param5 = gr.Textbox(label="预留字段5(param5)")
+            param6 = gr.Textbox(label="预留字段6(param6)")
+            param7 = gr.Textbox(label="预留字段7(param7)")
+            param8 = gr.Textbox(label="预留字段8(param8)")
+            param9 = gr.Textbox(label="预留字段9(param9)")
+            param10 = gr.Textbox(label="预留字段10(param10)")
+
+            button = gr.Button(value="提交")
+            result = gr.Textbox(label="结果")
+
+            button.click(fn=test_click,
+                         inputs=[wind_name, wind_full_name, type, is_vertical_table, merge_columns, vertical_read_cols,
+                                 vertical_index_cols, vertical_col_key, vertical_col_value, resolve_col_prefix,
+                                 wind_name_exec, wind_turbine_number, time_stamp, active_power, rotor_speed,
+                                 generator_speed, wind_velocity, pitch_angle_blade_1, pitch_angle_blade_2,
+                                 pitch_angle_blade_3, cabin_position, true_wind_direction, yaw_error1,
+                                 set_value_of_active_power, gearbox_oil_temperature,
+                                 generatordrive_end_bearing_temperature, generatornon_drive_end_bearing_temperature,
+                                 wind_turbine_status, wind_turbine_status2, cabin_temperature, twisted_cable_angle,
+                                 front_back_vibration_of_the_cabin, side_to_side_vibration_of_the_cabin, actual_torque,
+                                 given_torque, clockwise_yaw_count, counterclockwise_yaw_count, unusable,
+                                 power_curve_available, required_gearbox_speed, inverter_speed_master_control,
+                                 outside_cabin_temperature, main_bearing_temperature,
+                                 gearbox_high_speed_shaft_bearing_temperature,
+                                 gearboxmedium_speed_shaftbearing_temperature,
+                                 gearbox_low_speed_shaft_bearing_temperature, generator_winding1_temperature,
+                                 generator_winding2_temperature, generator_winding3_temperature, turbulence_intensity,
+                                 param1, param2, param3, param4, param5, param6, param7, param8, param9, param10
+                                 ], outputs=[result])
+            wind_name.change(fill_data, inputs=[wind_name, type],
+                             outputs=[wind_full_name, is_vertical_table, merge_columns, vertical_read_cols,
+                                      vertical_index_cols, vertical_col_key, vertical_col_value, resolve_col_prefix,
+                                      wind_name_exec, wind_turbine_number, time_stamp, active_power, rotor_speed,
+                                      generator_speed, wind_velocity, pitch_angle_blade_1, pitch_angle_blade_2,
+                                      pitch_angle_blade_3, cabin_position, true_wind_direction, yaw_error1,
+                                      set_value_of_active_power, gearbox_oil_temperature,
+                                      generatordrive_end_bearing_temperature,
+                                      generatornon_drive_end_bearing_temperature,
+                                      wind_turbine_status, wind_turbine_status2, cabin_temperature, twisted_cable_angle,
+                                      front_back_vibration_of_the_cabin, side_to_side_vibration_of_the_cabin,
+                                      actual_torque,
+                                      given_torque, clockwise_yaw_count, counterclockwise_yaw_count, unusable,
+                                      power_curve_available, required_gearbox_speed, inverter_speed_master_control,
+                                      outside_cabin_temperature, main_bearing_temperature,
+                                      gearbox_high_speed_shaft_bearing_temperature,
+                                      gearboxmedium_speed_shaftbearing_temperature,
+                                      gearbox_low_speed_shaft_bearing_temperature, generator_winding1_temperature,
+                                      generator_winding2_temperature, generator_winding3_temperature,
+                                      turbulence_intensity,
+                                      param1, param2, param3, param4, param5, param6, param7, param8, param9, param10])
+
+if __name__ == "__main__":
+    demo.launch(server_name='0.0.0.0', server_port=7860, auth=('znzn', "znzn123"))
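The several dozen hand-written gr.Textbox declarations, and the matching inputs/outputs lists, could be generated from a single field table; a hypothetical sketch with only two fields shown (the full list and the argument order of test_click must stay in sync):

    FIELDS = [
        ('wind_turbine_number', '风机编号(wind_turbine_number)'),
        ('time_stamp', '时间戳(time_stamp)'),
        # ... remaining fields elided
    ]
    boxes = {name: gr.Textbox(label=label) for name, label in FIELDS}
    button.click(fn=test_click,
                 inputs=[wind_name, wind_full_name, type] + list(boxes.values()),
                 outputs=[result])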

+ 28 - 0
tmp_file/hebing_matlib_result.py

@@ -0,0 +1,28 @@
+from os import *
+
+import pandas as pd
+
+read_path = r"D:\data\电量损失及散点图"
+df = pd.DataFrame()
+
+cols = ['风机', '应发电量', '实发电量', '停机损失电量', '坏点+限电损失电量', '性能损失电量', '坏点损失电量', '限电损失电量', '超发电量', '应发电量百分比', '实发电量百分比',
+        '停机损失电量百分比', '坏点+限电损失电量百分比', '性能损失电量百分比', '坏点损失电量百分比', '限电损失电量百分比', '超发电量百分比', '平均风速', '可利用率']
+
+for root, dirs, files in walk(read_path):
+    if files:
+        base_name = path.basename(root)
+        wind_df = pd.DataFrame()
+        print(root)
+        df1 = pd.read_excel(path.join(root, "EPPer.xls"), usecols=['应发电量百分比', '实发电量百分比',
+                                                                     '停机损失电量百分比', '坏点+限电损失电量百分比', '性能损失电量百分比',
+                                                                     '坏点损失电量百分比',
+                                                                     '限电损失电量百分比', '超发电量百分比', '平均风速', '可利用率'])
+        df2 = pd.read_excel(path.join(root, "EPKW.xls"),
+                            usecols=['应发电量', '实发电量', '停机损失电量', '坏点+限电损失电量', '性能损失电量', '坏点损失电量', '限电损失电量', '超发电量'])
+        wind_df = pd.concat([df1, df2], axis=1)
+        wind_df['风机'] = base_name
+        wind_df.reset_index(inplace=True)
+        print(wind_df.columns)
+        df = pd.concat([df, wind_df], ignore_index=True)
+
+df.to_csv("合并结果.csv", index=False, encoding='utf8', columns=cols)

+ 77 - 0
tmp_file/hebing_muti_batch.py

@@ -0,0 +1,77 @@
+import multiprocessing
+import sys
+from os import *
+
+import pandas as pd
+
+sys.path.insert(0, path.abspath(__file__).split("tmp_file")[0])
+
+
+def hebing_and_save(new_batch_save_path, name, paths):
+    df = pd.DataFrame()
+    # The loop variable must not be called `path`: that would shadow os.path
+    # (from the wildcard import above) and break path.join below
+    for file_path in paths:
+        now_df = read_file_to_df(file_path)
+        df = pd.concat([df, now_df])
+
+    df.sort_values(by=['time_stamp'], inplace=True)
+
+    create_file_path(new_batch_save_path)
+    df.to_csv(path.join(new_batch_save_path, name), index=False, encoding='utf8')
+
+
+if __name__ == '__main__':
+
+    env = 'prod'
+    if len(sys.argv) >= 2:
+        env = sys.argv[1]
+
+    from utils.conf.read_conf import yaml_conf
+
+    conf_path = path.abspath(__file__).split("tmp_file")[0] + f"/conf/etl_config_{env}.yaml"
+    environ['ETL_CONF'] = conf_path
+    yaml_config = yaml_conf(conf_path)
+    environ['env'] = env
+
+    from utils.file.trans_methods import read_file_to_df, create_file_path
+
+    from etl.wind_power.fault_warn.FaultWarnTrans import FaultWarnTrans
+    from etl.wind_power.min_sec.MinSecTrans import MinSecTrans
+    from service.plt_service import get_hebing_data_by_batch_no_and_type
+
+    save_batch = 'WOF085500008-2-3'
+    save_batch_name = '合并'
+    trans_type = 'second'
+    read_batchs = ['WOF085500008-WOB000003', 'WOF085500008-WOB000002']
+    read_paths = list()
+
+    new_batch_save_path = ''
+
+    for read_data in read_batchs:
+        data = get_hebing_data_by_batch_no_and_type(read_data, trans_type)
+        save_db = True
+
+        exec_process = None
+        if data['transfer_type'] in ['second', 'minute']:
+            exec_process = MinSecTrans(data=data, save_db=save_db)
+
+        if data['transfer_type'] in ['fault', 'warn']:
+            exec_process = FaultWarnTrans(data=data, save_db=save_db)
+
+        if exec_process is None:
+            raise Exception("No exec process")
+
+        read_paths.append(exec_process.pathsAndTable.get_save_path())
+        new_batch_save_path = path.join(exec_process.pathsAndTable.save_path, save_batch + "_" + save_batch_name,
+                                           trans_type)
+
+    file_dict = dict()
+
+    for read_path in read_paths:
+        for file in listdir(read_path):
+            if file in file_dict:
+                file_dict[file].append(path.join(read_path, file))
+            else:
+                file_dict[file] = [path.join(read_path, file)]
+
+    with multiprocessing.Pool(len(file_dict.keys())) as pool:
+        pool.starmap(hebing_and_save, [(new_batch_save_path, name, paths) for name, paths in file_dict.items()])
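One pool worker per output file can oversubscribe the host when a batch contains many files; a sketch that caps the pool at the CPU count instead:

    processes = min(len(file_dict), multiprocessing.cpu_count())
    with multiprocessing.Pool(processes) as pool:
        pool.starmap(hebing_and_save,
                     [(new_batch_save_path, name, paths) for name, paths in file_dict.items()])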

+ 173 - 0
tmp_file/organize_xinhua_files.py

@@ -0,0 +1,173 @@
+import datetime
+import multiprocessing
+import warnings
+from os import *
+
+import numpy as np
+import pandas as pd
+
+warnings.filterwarnings("ignore")
+
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # Walk every entry under dir_path; the parameter must not be named `path`,
+    # which would shadow os.path (wildcard-imported above) and break the calls below
+    for item in listdir(dir_path):
+        item_path = path.join(dir_path, item)
+        if path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif path.isfile(item_path):
+            if dir_path not in directory_dict:
+                directory_dict[dir_path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[dir_path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[dir_path].append(item_path)
+
+
+# 读取路径下所有的excel文件
+def read_excel_files(read_path):
+    if path.isfile(read_path):
+        return [read_path]
+
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# 创建路径
+def create_file_path(target_path, is_file_path=False):
+    """
+    Create a directory if it does not already exist.
+    :param target_path: directory to create
+    :param is_file_path: whether target_path also contains a file name
+    """
+    # Named target_path instead of `path` to avoid shadowing os.path
+    if is_file_path:
+        target_path = path.dirname(target_path)
+
+    if not path.exists(target_path):
+        makedirs(target_path, exist_ok=True)
+
+
+def boolean_is_check_data(df_cols):
+    fault_list = ['快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机', '工作模式', '风机自身故障停机', '限功率运行状态']
+
+    df_cols = [str(i).split('_')[-1] for i in df_cols]
+    for fault in fault_list:
+        if fault in df_cols:
+            return True
+
+    return False
+
+
+def read_file_to_df(file_path):
+    df = pd.read_excel(file_path)
+    wind_name = [i for i in df.columns if i.find('_') > -1][0].split('_')[0]
+    df.columns = [i.split('_')[-1] for i in df.columns]
+    df['wind_name'] = wind_name
+
+    return boolean_is_check_data(df.columns), wind_name, df
+
+
+def save_to_file(dfs, wind_name, save_path='', param='', is_check=False, all_cols=None,
+                 result_data_list=None):
+    # Mutable defaults (and a Manager() created at definition time) are replaced by
+    # None-guards; the callers below always pass both arguments explicitly.
+    if all_cols is None:
+        all_cols = []
+    if result_data_list is None:
+        result_data_list = []
+    try:
+        if is_check:
+            df = pd.concat(dfs)
+        else:
+            df = dfs[0]
+            for index, now_df in enumerate(dfs):
+                if index > 0:
+                    df = pd.merge(df, now_df, on=['采样时间', 'wind_name'], how='outer')
+    except Exception as e:
+        print(wind_name, e)
+        raise e
+
+    df.reset_index(inplace=True)
+    df.drop_duplicates(inplace=True, subset=['采样时间', 'wind_name'])
+    if 'index' in df.columns:
+        del df['index']
+    # The file is written into save_path/param below, so create that directory
+    create_file_path(path.join(save_path, param))
+    df.sort_values(by='采样时间', inplace=True)
+
+    loss_cols = list([i for i in df.columns if i != 'wind_name'])
+    loss_cols.sort()
+
+    loss_cols.insert(0, wind_name)
+    loss_cols.insert(0, path.basename(save_path) + '-' + param)
+
+    result_data_list.append(loss_cols)
+
+    for col in set(all_cols):
+        if col not in df.columns:
+            df[col] = np.nan
+
+    df.to_csv(path.join(save_path, param, wind_name + '.csv'), encoding='utf8', index=False)
+
+
+if __name__ == '__main__':
+    begin = datetime.datetime.now()
+    # dir1 = r'D:\data\新华水电\测试'
+    # save_path = r'D:\data\新华水电\整理数据'
+    result_datas = [
+        (r'/data/download/collection_data/1进行中/新华水电/风机SCADA数据/8月风机数据',
+         r'/data/download/collection_data/1进行中/新华水电/整理数据/8月'),
+        (r'/data/download/collection_data/1进行中/新华水电/风机SCADA数据/9月风机数据',
+         r'/data/download/collection_data/1进行中/新华水电/整理数据/9月')
+    ]
+
+    result_data_list = multiprocessing.Manager().list()
+
+    for dir1, save_path in result_datas:
+        files = read_excel_files(dir1)
+        with multiprocessing.Pool(30) as pool:
+            datas = pool.starmap(read_file_to_df, [(file,) for file in files])
+        data_wind_name = dict()
+        check_wind_name = dict()
+
+        data_all_cols = list()
+        check_all_cols = list()
+        for data in datas:
+            check_data, wind_name, df = data[0], data[1], data[2]
+
+            if '工作模式' not in df.columns:
+                # df.reset_index(inplace=True)
+                # df.set_index(keys=['采样时间'], inplace=True)
+                if check_data:
+                    check_all_cols.extend(list(df.columns))
+                    if wind_name in check_wind_name.keys():
+                        check_wind_name[wind_name].append(df)
+                    else:
+                        check_wind_name[wind_name] = [df]
+                else:
+                    data_all_cols.extend(list(df.columns))
+                    if wind_name in data_wind_name.keys():
+                        data_wind_name[wind_name].append(df)
+                    else:
+                        data_wind_name[wind_name] = [df]
+
+        # with multiprocessing.Pool(30) as pool:
+        #     pool.starmap(combine_df,
+        #                  [(dfs, wind_name, save_path, "事件数据", True, check_all_cols, result_data_list) for wind_name, dfs
+        #                   in
+        #                   check_wind_name.items()])
+
+        with multiprocessing.Pool(30) as pool:
+            pool.starmap(save_to_file,
+                         [(dfs, wind_name, save_path, "数据", False, data_all_cols, result_data_list) for wind_name, dfs
+                          in
+                          data_wind_name.items()])
+
+        print(datetime.datetime.now() - begin)
+
+    normal_list = list(result_data_list)
+    normal_list.sort(key=lambda x: (x[0], int(x[1][2:])))
+
+    with open('loss_col.csv', 'w', encoding='utf8') as f:
+        for datas in normal_list:
+            f.write(",".join(datas))
+            f.write('\n')
+
+    print(datetime.datetime.now() - begin)
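The incremental outer merge inside save_to_file can be condensed with functools.reduce; a sketch of the non-check branch under the same join keys:

    from functools import reduce

    df = reduce(lambda left, right: pd.merge(left, right, on=['采样时间', 'wind_name'], how='outer'),
                dfs)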

+ 205 - 0
tmp_file/organize_xinhua_files_data.py

@@ -0,0 +1,205 @@
+import datetime
+import multiprocessing
+import warnings
+from os import *
+
+import pandas as pd
+
+warnings.filterwarnings("ignore")
+
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # Walk every entry under dir_path; `path` is reserved for os.path here
+    for item in listdir(dir_path):
+        item_path = path.join(dir_path, item)
+        if path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif path.isfile(item_path):
+            if dir_path not in directory_dict:
+                directory_dict[dir_path] = []
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[dir_path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[dir_path].append(item_path)
+
+
+# 读取路径下所有的excel文件
+def read_excel_files(read_path):
+    if path.isfile(read_path):
+        return [read_path]
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# 创建路径
+def create_file_path(target_path, is_file_path=False):
+    """
+    Create a directory if it does not already exist.
+    :param target_path: directory to create
+    :param is_file_path: whether target_path also contains a file name
+    """
+    # Named target_path instead of `path` to avoid shadowing os.path
+    if is_file_path:
+        target_path = path.dirname(target_path)
+    if not path.exists(target_path):
+        makedirs(target_path, exist_ok=True)
+
+
+def boolean_is_check_data(df_cols, need_valid=True):
+    if not need_valid:
+        return True
+    fault_list = ['快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机', '风机自身故障停机', '限功率运行状态']
+    df_cols = [str(i).split('_')[-1] for i in df_cols]
+    for fault in fault_list:
+        if fault in df_cols:
+            return True
+    return False
+
+
+def read_file_to_df(file_path):
+    df = pd.read_excel(file_path)
+    wind_name = [i for i in df.columns if i.find('_') > -1][0].split('_')[0]
+    df.columns = [i.split('_')[-1] for i in df.columns]
+    df['wind_name'] = wind_name
+    df['采样时间'] = pd.to_datetime(df['采样时间'])
+    df['采样时间'] = df['采样时间'].dt.ceil('min')  # 'min' replaces the deprecated 'T' alias
+    return boolean_is_check_data(df.columns, file_path.find('批次') > -1), wind_name, df
+
+
+def read_guzhangbaojing(file_path):
+    try:
+        df = pd.read_excel(file_path)
+        df.rename(columns={'风机名': 'wind_name'}, inplace=True)
+        df['采样时间'] = pd.to_datetime(df['采样时间'])
+        df['采样时间'] = df['采样时间'].dt.ceil('min')  # 'min' replaces the deprecated 'T' alias
+        df = df[(df['采样时间'] >= '2024-08-01 00:00:00') & (df['采样时间'] < '2024-10-01 00:00:00')]
+        return df
+    except Exception as e:
+        print(file_path, e)
+        raise e
+
+
+def combine_df(dfs, wind_name, save_path=''):
+    print(wind_name)
+    cols = list()
+    col_map = dict()
+    try:
+        df = dfs[0]
+        cols.extend(df.columns)
+        for index, now_df in enumerate(dfs):
+            if index > 0:
+                for col in now_df.columns:
+                    if col in cols and col not in ['采样时间', 'wind_name']:
+                        if col in col_map.keys():
+                            count = col_map[col]
+                            col_map[col] = count + 1
+                        else:
+                            count = 1
+                            col_map[col] = 1
+                        now_df.rename(columns={col: col + '__' + str(count)}, inplace=True)
+                df = pd.merge(df, now_df, on=['采样时间', 'wind_name'], how='outer')
+                cols.extend(now_df.columns)
+    except Exception as e:
+        print(wind_name, e)
+        raise e
+    df.reset_index(inplace=True)
+    df.drop_duplicates(inplace=True, subset=['采样时间', 'wind_name'])
+    if 'index' in df.columns:
+        del df['index']
+    create_file_path(save_path)
+    df.sort_values(by='采样时间', inplace=True)
+    df.set_index(keys=['采样时间', 'wind_name'], inplace=True)
+    return wind_name, df
+
+
+def save_to_csv(wind_name, df):
+    try:
+        col_tuples = [(col.split('__')[0], col) for col in df.columns if col.find('__') > -1]
+        col_dict = dict()
+        for origin, col in col_tuples:
+            if origin in col_dict.keys():
+                col_dict[origin].add(col)
+            else:
+                col_dict[origin] = {col}
+
+        for origin, cols in col_dict.items():
+            print(wind_name, origin, cols)
+            if pd.api.types.is_numeric_dtype(df[origin]):
+                df[origin] = df[list(cols)].max(axis=1)
+            else:
+                # NaN is truthy in Python, so test with pd.notna; fall back to None when all values are missing
+                df[origin] = df[list(cols)].apply(
+                    lambda x: next((i for i in x.values if pd.notna(i)), None), axis=1)
+            for col in cols:
+                if col != origin:
+                    del df[col]
+
+        df.to_csv(path.join(save_path, wind_name + '.csv'), encoding='utf8')
+
+    except Exception as e:
+        print(wind_name, df.columns)
+        raise e
+
+
+if __name__ == '__main__':
+    begin = datetime.datetime.now()
+
+    base_path = r'/data/download/collection_data/1进行中/新华水电/收资数据/风机SCADA数据'
+
+    dir1 = base_path + r'/data'
+    dir2 = base_path + r'/故障报警/汇能机组数据-故障'
+    dir3 = base_path + r'/故障报警/报警'
+    save_path = r'/data/download/collection_data/1进行中/新华水电/清理数据/合并批次1-2故障报警'
+
+    create_file_path(save_path)
+
+    # result_datas = [
+    #     (r'/data/download/collection_data/1进行中/新华水电/风机SCADA数据',
+    #      r'/data/download/collection_data/1进行中/新华水电/整理数据/批次1-2合并'),
+    # ]
+
+    data_wind_name = dict()
+    files = read_excel_files(dir1)
+    with multiprocessing.Pool(30) as pool:
+        datas = pool.starmap(read_file_to_df, [(file,) for file in files])
+    for data in datas:
+        check_data, wind_name, df = data[0], data[1], data[2]
+        if wind_name in data_wind_name.keys():
+            data_wind_name[wind_name].append(df)
+        else:
+            data_wind_name[wind_name] = [df]
+
+    with multiprocessing.Pool(30) as pool:
+        data_dfs = pool.starmap(combine_df,
+                                [(dfs, wind_name, save_path) for wind_name, dfs
+                                 in
+                                 data_wind_name.items()])
+
+    result_data_dict = dict()
+    for wind_name, df in data_dfs:
+        result_data_dict[wind_name] = df
+
+    for dir4 in [dir2, dir3]:
+        guzhang_files = read_excel_files(dir4)
+        with multiprocessing.Pool(30) as pool:
+            guzhang_datas = pool.starmap(read_guzhangbaojing, [(file,) for file in guzhang_files])
+        guzhang_df = pd.DataFrame()
+        for df in guzhang_datas:
+            if not df.empty:
+                guzhang_df = pd.concat([guzhang_df, df])
+        wind_names = set(list(guzhang_df['wind_name'].values))
+        for wind_name in wind_names:
+            now_df = guzhang_df[guzhang_df['wind_name'] == wind_name]
+            if wind_name in result_data_dict.keys():
+                now_df.reset_index(inplace=True)
+                now_df.drop_duplicates(inplace=True, subset=['采样时间', 'wind_name'])
+                if 'index' in now_df.columns:
+                    del now_df['index']
+                now_df.sort_values(by='采样时间', inplace=True)
+                now_df.set_index(keys=['采样时间', 'wind_name'], inplace=True)
+                res_df = result_data_dict[wind_name]
+                result_data_dict[wind_name] = pd.concat([res_df, now_df], axis=1)
+
+    with multiprocessing.Pool(30) as pool:
+        pool.starmap(save_to_csv, [(wind_name, df) for wind_name, df in result_data_dict.items()])
+
+    print(datetime.datetime.now() - begin)
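Where duplicated columns differ only in which rows are missing, the suffix-and-merge repair above can be approximated with combine_first on indexed frames; a sketch (it keeps the first non-null value per cell, matching the non-numeric branch of save_to_csv but not the numeric max):

    indexed = [d.set_index(['采样时间', 'wind_name']) for d in dfs]
    merged = indexed[0]
    for other in indexed[1:]:
        merged = merged.combine_first(other)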

+ 97 - 0
tmp_file/orgranize_hongyang.py

@@ -0,0 +1,97 @@
+import copy
+import multiprocessing
+import warnings
+from os import *
+
+import chardet
+import pandas as pd
+
+warnings.filterwarnings("ignore")
+
+# read_path = r'/home/wzl/test_data/红阳'
+# save_dir = r'/home/wzl/test_data/整理'
+
+read_path = r'D:\data\红阳\红阳秒级分测点\红阳'
+save_dir = r'D:\data\红阳\红阳秒级分测点\整理'
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # Walk every entry under dir_path; the parameter must not be named `path`,
+    # which would shadow os.path (wildcard-imported above) and break the calls below
+    for item in listdir(dir_path):
+        item_path = path.join(dir_path, item)
+        if path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif path.isfile(item_path):
+            if dir_path not in directory_dict:
+                directory_dict[dir_path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[dir_path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[dir_path].append(item_path)
+
+
+# 读取路径下所有的excel文件
+def read_excel_files(read_path):
+    if path.isfile(read_path):
+        return [read_path]
+
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+all_files = read_excel_files(read_path)
+
+
+# 获取文件编码
+def detect_file_encoding(filename):
+    # 读取文件的前1000个字节(足够用于大多数编码检测)
+    with open(filename, 'rb') as f:
+        rawdata = f.read(1000)
+    result = chardet.detect(rawdata)
+    encoding = result['encoding']
+
+    if encoding is None:
+        encoding = 'gb18030'
+
+    if encoding.lower() in ['utf-8', 'ascii', 'utf8']:
+        return 'utf-8'
+
+    return 'gb18030'
+
+
+def read_and_organize(file):
+    df = pd.read_csv(file, encoding=detect_file_encoding(file))
+    return file, df
+
+
+if __name__ == '__main__':
+
+    with multiprocessing.Pool(10) as pool:
+        bak_datas = pool.starmap(read_and_organize, [(i,) for i in all_files])
+
+    datas = copy.deepcopy(bak_datas)
+    wind_name_df = dict()
+    for file, df in datas:
+        all_cols = [i for i in df.columns if i.find('#') > -1]
+        col = all_cols[0]
+        cedian = str(col).split("_")[-1]
+        wind_names = set([str(i).split("#")[0].replace("红阳风电场_", "") for i in all_cols])
+
+        print(file, df.columns)
+        for wind_name in wind_names:
+            # Match "_<wind_name>#" exactly so e.g. turbine 1 does not also pick up turbine 11
+            cols = [i for i in all_cols if i.find('_' + wind_name + '#') > -1]
+            cols.insert(0, '统计时间')
+            query_df = df[cols].copy()
+            query_df.columns = [str(i).split('_')[-1] for i in query_df.columns]
+            query_df['风机编号'] = wind_name
+            if wind_name in wind_name_df.keys():
+                now_df = wind_name_df[wind_name]
+                wind_name_df[wind_name] = pd.merge(now_df, query_df, on=['统计时间', '风机编号'], how='outer')
+            else:
+                wind_name_df[wind_name] = query_df
+
+    # Make sure the output directory exists before writing
+    makedirs(save_dir, exist_ok=True)
+    for wind_name, df in wind_name_df.items():
+        df.to_csv(path.join(save_dir, wind_name + '#.csv'), index=False, encoding='utf8')
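For reference, the column layout the loop above assumes (example name hypothetical):

    col = '红阳风电场_12#风速_测点A'
    turbine = col.split('#')[0].replace('红阳风电场_', '')  # -> '12'
    measurement = col.split('_')[-1]                        # -> '测点A'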

+ 91 - 0
tmp_file/power_derating.py

@@ -0,0 +1,91 @@
+import multiprocessing
+from os import *
+
+import matplotlib
+
+matplotlib.use('Agg')
+matplotlib.rcParams['font.family'] = 'SimHei'
+matplotlib.rcParams['font.sans-serif'] = ['SimHei']
+
+import numpy as np
+from matplotlib import pyplot as plt
+
+from utils.file.trans_methods import read_file_to_df
+from utils.file.trans_methods import read_excel_files
+import pandas as pd
+
+
+def select_data(file, curve_wv, curve_ap, save_path):
+    name = path.basename(file).split("@")[0]
+    try:
+        df = read_file_to_df(file)
+        df.dropna(subset=['有功功率 kW均值', '风速 m/s均值', '有功功率设定 kW均值'], inplace=True)
+        # Explicit copies keep the later column assignments off pandas' SettingWithCopy path
+        ap_gt_0_df = df[df['有功功率 kW均值'] > 0].copy()
+        ap_le_0_df = df[df['有功功率 kW均值'] <= 0].copy()
+        ap_le_0_df["marker"] = -1
+
+        ap = ap_gt_0_df['有功功率 kW均值'].values
+        wv = ap_gt_0_df['风速 m/s均值'].values
+        ap_set = ap_gt_0_df['有功功率设定 kW均值'].values
+
+        ap_gt_0_in = [0] * ap_gt_0_df.shape[0]
+
+        for i in range(len(ap_set)):
+            wind_speed = wv[i]
+            active_power = ap[i]
+            active_power_set = ap_set[i]
+
+            if active_power >= 2200 - 200:
+                ap_gt_0_in[i] = 1
+            else:
+                diffs = np.abs(curve_wv - wind_speed)
+                # 找到差值最小的索引和对应的差值
+                minDiff, idx = np.min(diffs), np.argmin(diffs)
+
+                # 使用找到的索引获取对应的值
+                closestValue = curve_ap[idx]
+                if active_power - closestValue >= -100:
+                    ap_gt_0_in[i] = 1
+
+        ap_gt_0_df['marker'] = ap_gt_0_in
+        df = pd.concat([ap_gt_0_df, ap_le_0_df])
+
+        df.to_csv(path.join(save_path, name + '.csv'), index=False, encoding='utf-8')
+
+        df = df[['时间', '风速 m/s均值', '有功功率 kW均值', '有功功率设定 kW均值', 'marker']]
+
+        df = df[df['marker'] == 1]
+
+        x = df['风速 m/s均值'].values
+        y = df['有功功率 kW均值'].values
+        # Draw the scatter plot; each worker handles many files, so use a fresh
+        # figure and close it after saving, otherwise points accumulate across files
+        if not df.empty:
+            plt.figure(figsize=(8, 6))
+            plt.scatter(x, y, s=10, c='blue')
+
+            # Title and axis labels
+            plt.title(name)
+            plt.xlabel('风速均值')
+            plt.ylabel('有功功率均值')
+
+            # Save and release the figure
+            plt.savefig(path.join(save_path, name + '均值.png'))
+            plt.close()
+
+    except Exception as e:
+        print(path.basename(file), "出错", str(e))
+        raise e
+
+
+if __name__ == '__main__':
+    wind_power_df = read_file_to_df(r"D:\中能智能\matlib计算相关\标记derating\PV_Curve.csv")
+    curve_wv = wind_power_df["风速"].values
+    curve_ap = wind_power_df["功率"].values
+
+    all_files = read_excel_files(r"Z:\collection_data\1进行中\诺木洪风电场-甘肃-华电\清理数据\min-666")
+    save_path = r"D:\trans_data\诺木洪\清理数据\min-666-derating"
+
+    # save_path = r"Z:\collection_data\1进行中\诺木洪风电场-甘肃-华电\清理数据\min-666-marker"
+
+    # for file in all_files:
+
+    with multiprocessing.Pool(10) as pool:
+        pool.starmap(select_data, [(i, curve_wv, curve_ap, save_path) for i in all_files])
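The nearest-point lookup scans the whole contract curve once per row; since the curve is sorted by wind speed, the expected power can be vectorised with np.interp. A sketch of an equivalent marker (note that interpolation, unlike nearest-point, blends between curve knots, so results can differ slightly there):

    expected_ap = np.interp(wv, curve_wv, curve_ap)
    ap_gt_0_in = ((ap >= 2200 - 200) | (ap - expected_ap >= -100)).astype(int).tolist()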

+ 90 - 0
tmp_file/power_derating_biaozhun.py

@@ -0,0 +1,90 @@
+from os import *
+
+import matplotlib
+import numpy as np
+
+# Select the backend before anything imports pyplot (utils.draw.draw_file may do so)
+matplotlib.use('Agg')
+matplotlib.rcParams['font.family'] = 'SimHei'  # or 'Microsoft YaHei'
+matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # or ['Microsoft YaHei']
+
+from utils.draw.draw_file import scatter
+
+from utils.file.trans_methods import read_file_to_df
+from utils.file.trans_methods import read_excel_files
+import pandas as pd
+
+
+class ContractPowerCurve(object):
+
+    def __init__(self, df: pd.DataFrame, wind_velocity='风速', active_power='功率'):
+        self.df = df
+        self.wind_velocity = wind_velocity
+        self.active_power = active_power
+
+
+def marker_active_power(contract_power_curve_class: ContractPowerCurve, df: pd.DataFrame, active_power='有功功率 kW均值',
+                        wind_velocity='风速 m/s均值'):
+    """
+    标记有功功率为正的记录
+    :param contract_power_curve_class: 合同功率曲线
+    :param df: 原始数据
+    :return: 标记有功功率为正的原始数据
+    """
+    contract_power_curve_df = contract_power_curve_class.df
+    curve_wv = contract_power_curve_df[contract_power_curve_class.wind_velocity].values
+    curve_ap = contract_power_curve_df[contract_power_curve_class.active_power].values
+
+    df.dropna(subset=[active_power, wind_velocity], inplace=True)
+    ap_gt_0_df = df[df[active_power] > 0]
+    ap_le_0_df = df[df[active_power] <= 0]
+    ap_le_0_df["marker"] = -1
+
+    active_power_values = ap_gt_0_df[active_power].values
+    wind_speed_values = ap_gt_0_df[wind_velocity].values
+    ap_gt_0_in = [0] * ap_gt_0_df.shape[0]
+
+    for i in range(len(ap_gt_0_in)):
+        wind_speed = wind_speed_values[i]
+        active_power = active_power_values[i]
+
+        # if active_power >= 2200 - 200:
+        #     ap_gt_0_in[i] = 1
+        # else:
+        diffs = np.abs(curve_wv - wind_speed)
+        # 找到差值最小的索引和对应的差值
+        minDiff, idx = np.min(diffs), np.argmin(diffs)
+
+        # 使用找到的索引获取对应的值
+        closestValue = curve_ap[idx]
+        if active_power - closestValue >= -100:
+            ap_gt_0_in[i] = 1
+
+    ap_gt_0_df['marker'] = ap_gt_0_in
+    return pd.concat([ap_gt_0_df, ap_le_0_df])
+
+
+if __name__ == '__main__':
+    wind_power_df = read_file_to_df(r"D:\中能智能\matlib计算相关\标记derating\PV_Curve.csv")
+
+    all_files = read_excel_files(r"Z:\collection_data\1进行中\诺木洪风电场-甘肃-华电\清理数据\min-666")
+    save_path = r"D:\trans_data\诺木洪\清理数据\min-666-derating"
+
+    wind_power_df_class = ContractPowerCurve(wind_power_df)
+
+    for file in all_files:
+        name = path.basename(file).split("@")[0]
+        try:
+            df = read_file_to_df(file)
+            df = marker_active_power(wind_power_df_class, df)
+            df = df[df['marker'] == 1]
+            df.to_csv(path.join(save_path, name + '.csv'), index=False, encoding='utf-8')
+
+            # 使用scatter函数绘制散点图
+            if not df.empty:
+                scatter(name, x_label='风速均值', y_label='有功功率均值', x_values=df['风速 m/s均值'].values,
+                        y_values=df['有功功率 kW均值'].values, color='green',
+                        save_file_path=path.join(save_path, name + '均值.png'))
+
+        except Exception as e:
+            print(path.basename(file), "出错", str(e))
+            raise e

+ 213 - 0
tmp_file/power_derating_for_chunlin.py

@@ -0,0 +1,213 @@
+from os import *
+
+import matplotlib
+import numpy as np
+from matplotlib import pyplot as plt
+
+matplotlib.use('Agg')
+matplotlib.rcParams['font.family'] = 'SimHei'  # 或者 'Microsoft YaHei'
+matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # 或者 ['Microsoft YaHei']
+
+import pandas as pd
+import chardet
+import warnings
+
+warnings.filterwarnings("ignore")
+
+
+# 获取文件编码
+def detect_file_encoding(filename):
+    # 读取文件的前1000个字节(足够用于大多数编码检测)
+    with open(filename, 'rb') as f:
+        rawdata = f.read(1000)
+    result = chardet.detect(rawdata)
+    encoding = result['encoding']
+
+    if encoding is None:
+        encoding = 'gb18030'
+
+    if encoding and encoding.lower() == 'gb2312' or encoding.lower().startswith("windows"):
+        encoding = 'gb18030'
+    return encoding
+
+
+def del_blank(df=pd.DataFrame(), cols=list()):
+    for col in cols:
+        if df[col].dtype == object:
+            df[col] = df[col].str.strip()
+    return df
+
+
+# 切割数组到多个数组
+def split_array(array, num):
+    return [array[i:i + num] for i in range(0, len(array), num)]
+
+
+# 读取数据到df
+def read_file_to_df(file_path, read_cols=list(), header=0):
+    try:
+        df = pd.DataFrame()
+        if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
+            encoding = detect_file_encoding(file_path)
+            end_with_gz = str(file_path).lower().endswith("gz")
+            if read_cols:
+                if end_with_gz:
+                    df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip', header=header)
+                else:
+                    df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header,
+                                     on_bad_lines='warn')
+            else:
+
+                if end_with_gz:
+                    df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header)
+                else:
+                    df = pd.read_csv(file_path, encoding=encoding, header=header, on_bad_lines='warn')
+
+        else:
+            xls = pd.ExcelFile(file_path)
+            # 获取所有的sheet名称
+            sheet_names = xls.sheet_names
+            for sheet in sheet_names:
+                if read_cols:
+                    now_df = pd.read_excel(xls, sheet_name=sheet, header=header, usecols=read_cols)
+                else:
+                    now_df = pd.read_excel(xls, sheet_name=sheet, header=header)
+
+                df = pd.concat([df, now_df])
+
+        print('文件读取成功', file_path, '文件数量', df.shape)
+    except Exception as e:
+        print('读取文件出错', file_path, str(e))
+        message = '文件:' + path.basename(file_path) + ',' + str(e)
+        raise ValueError(message)
+
+    return df
+
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # Walk every entry under dir_path; the parameter must not be named `path`,
+    # which would shadow os.path (wildcard-imported above) and break the calls below
+    for item in listdir(dir_path):
+        item_path = path.join(dir_path, item)
+        if path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif path.isfile(item_path):
+            if dir_path not in directory_dict:
+                directory_dict[dir_path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[dir_path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[dir_path].append(item_path)
+
+
+
+# 读取路径下所有的excel文件
+def read_excel_files(read_path):
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+class ContractPowerCurve(object):
+
+    def __init__(self, df: pd.DataFrame, wind_velocity='风速', active_power='功率'):
+        self.df = df
+        self.wind_velocity = wind_velocity
+        self.active_power = active_power
+
+
+# 创建路径
+def create_file_path(target_path, is_file_path=False):
+    # Named target_path instead of `path` to avoid shadowing os.path
+    if is_file_path:
+        target_path = path.dirname(target_path)
+
+    if not path.exists(target_path):
+        makedirs(target_path, exist_ok=True)
+
+
+def scatter(title, x_label, y_label, x_values, y_values, color='blue', size=10, save_file_path=''):
+    if save_file_path:
+        create_file_path(save_file_path, True)
+    else:
+        save_file_path = title + '.png'
+
+    plt.figure(figsize=(8, 6))
+    plt.title(title, fontsize=16)
+    plt.xlabel(x_label, fontsize=14)
+    plt.ylabel(y_label, fontsize=14)
+    plt.scatter(x_values, y_values, s=size, c=color)
+    plt.savefig(save_file_path)
+    plt.close()
+
+
+def marker_active_power(contract_power_curve_class: ContractPowerCurve, df: pd.DataFrame, active_power='有功功率 kW均值',
+                        wind_velocity='风速 m/s均值'):
+    """
+    标记有功功率为正的记录
+    :param contract_power_curve_class: 合同功率曲线
+    :param df: 原始数据
+    :return: 标记有功功率为正的原始数据
+    """
+    contract_power_curve_df = contract_power_curve_class.df
+    curve_wv = contract_power_curve_df[contract_power_curve_class.wind_velocity].values
+    curve_ap = contract_power_curve_df[contract_power_curve_class.active_power].values
+
+    df.dropna(subset=[active_power, wind_velocity], inplace=True)
+    ap_gt_0_df = df[df[active_power] > 0]
+    ap_le_0_df = df[df[active_power] <= 0]
+    ap_le_0_df["marker"] = -1
+
+    active_power_values = ap_gt_0_df[active_power].values
+    wind_speed_values = ap_gt_0_df[wind_velocity].values
+    ap_gt_0_in = [0] * ap_gt_0_df.shape[0]
+
+    for i in range(len(ap_gt_0_in)):
+        wind_speed = wind_speed_values[i]
+        active_power = active_power_values[i]
+
+        # if active_power >= 2200 - 200:
+        #     ap_gt_0_in[i] = 1
+        # else:
+        diffs = np.abs(curve_wv - wind_speed)
+        # 找到差值最小的索引和对应的差值
+        minDiff, idx = np.min(diffs), np.argmin(diffs)
+
+        # 使用找到的索引获取对应的值
+        closestValue = curve_ap[idx]
+        if active_power - closestValue >= -100:
+            ap_gt_0_in[i] = 1
+
+    ap_gt_0_df['marker'] = ap_gt_0_in
+    return pd.concat([ap_gt_0_df, ap_le_0_df])
+
+
+if __name__ == '__main__':
+    wind_power_df = read_file_to_df(r"D:\中能智能\matlib计算相关\标记derating\PV_Curve.csv")
+
+    all_files = read_excel_files(r"Z:\collection_data\1进行中\诺木洪风电场-甘肃-华电\清理数据\min-666")
+    save_path = r"D:\trans_data\诺木洪\清理数据\min-666-derating"
+
+    wind_power_df_class = ContractPowerCurve(wind_power_df)
+
+    for file in all_files:
+        name = path.basename(file).split("@")[0]
+        try:
+            df = read_file_to_df(file)
+            df = marker_active_power(wind_power_df_class, df)
+            df = df[df['marker'] == 1]
+            # 保存筛选后数据
+            name = name.replace('HD', 'HD2')
+            df.to_csv(path.join(save_path, name + '.csv'), index=False, encoding='utf-8')
+
+            # 使用scatter函数绘制散点图
+            if not df.empty:
+                scatter(name, x_label='风速均值', y_label='有功功率均值', x_values=df['风速 m/s均值'].values,
+                        y_values=df['有功功率 kW均值'].values, color='green',
+                        save_file_path=path.join(save_path, name + '均值.png'))
+
+        except Exception as e:
+            print(path.basename(file), "出错", str(e))
+            raise e

+ 262 - 0
tmp_file/pv_youxiaoxing.py

@@ -0,0 +1,262 @@
+import multiprocessing
+from os import *
+
+import matplotlib
+
+matplotlib.use('Agg')
+matplotlib.rcParams['font.family'] = 'SimHei'  # 或者 'Microsoft YaHei'
+matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # 或者 ['Microsoft YaHei']
+
+import chardet
+import warnings
+
+warnings.filterwarnings("ignore")
+
+import datetime
+
+import pandas as pd
+
+
+def get_time_space(df, time_str):
+    """
+    :return: 查询时间间隔
+    """
+    begin = datetime.datetime.now()
+    df1 = pd.DataFrame(df[time_str])
+    df1[time_str] = pd.to_datetime(df1[time_str], errors='coerce')
+    df1.sort_values(by=time_str, inplace=True)
+    df1['chazhi'] = df1[time_str].shift(-1) - df1[time_str]
+    # Sample roughly 1% of rows (at least one) so small frames do not sample zero rows
+    result = df1.sample(max(1, int(df1.shape[0] / 100)))['chazhi'].value_counts().idxmax().seconds
+    del df1
+    print(datetime.datetime.now() - begin)
+    return abs(result)
+
+
+def get_time_space_count(start_time: datetime.datetime, end_time: datetime.datetime, time_space=1):
+    """
+    获取俩个时间之间的个数
+    :return: 查询时间间隔
+    """
+    delta = end_time - start_time
+    total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
+
+    return abs(int(total_seconds / time_space)) + 1
+
+
+# 获取文件编码
+def detect_file_encoding(filename):
+    # 读取文件的前1000个字节(足够用于大多数编码检测)
+    with open(filename, 'rb') as f:
+        rawdata = f.read(1000)
+    result = chardet.detect(rawdata)
+    encoding = result['encoding']
+
+    if encoding is None:
+        encoding = 'gb18030'
+
+    if encoding and encoding.lower() == 'gb2312' or encoding.lower().startswith("windows"):
+        encoding = 'gb18030'
+    return encoding
+
+
+def del_blank(df=pd.DataFrame(), cols=list()):
+    for col in cols:
+        if df[col].dtype == object:
+            df[col] = df[col].str.strip()
+    return df
+
+
+# 切割数组到多个数组
+def split_array(array, num):
+    return [array[i:i + num] for i in range(0, len(array), num)]
+
+
+# 读取数据到df
+def read_file_to_df(file_path, read_cols=list(), header=0):
+    try:
+        df = pd.DataFrame()
+        if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
+            encoding = detect_file_encoding(file_path)
+            end_with_gz = str(file_path).lower().endswith("gz")
+            if read_cols:
+                if end_with_gz:
+                    df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip', header=header)
+                else:
+                    df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header,
+                                     on_bad_lines='warn')
+            else:
+
+                if end_with_gz:
+                    df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header)
+                else:
+                    df = pd.read_csv(file_path, encoding=encoding, header=header, on_bad_lines='warn')
+
+        else:
+            xls = pd.ExcelFile(file_path)
+            # 获取所有的sheet名称
+            sheet_names = xls.sheet_names
+            for sheet in sheet_names:
+                if read_cols:
+                    now_df = pd.read_excel(xls, sheet_name=sheet, header=header, usecols=read_cols)
+                else:
+                    now_df = pd.read_excel(xls, sheet_name=sheet, header=header)
+
+                df = pd.concat([df, now_df])
+
+        print('文件读取成功', file_path, '文件数量', df.shape)
+    except Exception as e:
+        print('读取文件出错', file_path, str(e))
+        message = '文件:' + path.basename(file_path) + ',' + str(e)
+        raise ValueError(message)
+
+    return df
+
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # Walk every entry under dir_path; the parameter must not be named `path`,
+    # which would shadow os.path (wildcard-imported above) and break the calls below
+    for item in listdir(dir_path):
+        item_path = path.join(dir_path, item)
+        if path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif path.isfile(item_path):
+            if dir_path not in directory_dict:
+                directory_dict[dir_path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[dir_path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[dir_path].append(item_path)
+
+
+
+# 读取路径下所有的excel文件
+def read_excel_files(read_path):
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# 创建路径
+def create_file_path(target_path, is_file_path=False):
+    # Named target_path instead of `path` to avoid shadowing os.path
+    if is_file_path:
+        target_path = path.dirname(target_path)
+
+    if not path.exists(target_path):
+        makedirs(target_path, exist_ok=True)
+
+
+def time_biaozhun(df):
+    time_space = get_time_space(df, '时间')
+    query_df = df[['时间']].copy()
+    query_df['时间'] = pd.to_datetime(df['时间'], errors="coerce")
+    query_df = query_df.dropna(subset=['时间'])
+    total = get_time_space_count(query_df['时间'].min(), query_df['时间'].max(), time_space)
+    return total, save_percent(1 - query_df.shape[0] / total), save_percent(1 - df.shape[0] / total)
+
+
+def save_percent(value, save_decimal=7):
+    return round(value * 100, save_decimal)
+
+
+def calc(df, file_name):
+    error_dict = {}
+    lose_dict = {}
+    error_dict['箱变'] = "".join(file_name.split(".")[:-1])
+    lose_dict['箱变'] = "".join(file_name.split(".")[:-1])
+
+    total, lose_time, error_time = time_biaozhun(df)
+    error_dict['时间'] = error_time
+    lose_dict['时间'] = lose_time
+
+    error_df = pd.DataFrame()
+    lose_df = pd.DataFrame()
+
+    try:
+        df.columns = ["".join(["逆变器" + "".join(col.split("逆变器")[1:])]) if col.find("逆变器") > -1 else col for col in
+                      df.columns]
+
+        for col in df.columns:
+            if col == '时间':
+                continue
+            query_df = df[[col]]
+            query_df[col] = pd.to_numeric(query_df[col], errors="coerce")
+            query_df = query_df.dropna(subset=[col])
+            lose_dict[col] = save_percent(1 - query_df.shape[0] / total)
+
+            if col.find('电压') > -1:
+                error_dict[col] = save_percent(query_df[query_df[col] < 0].shape[0] / total)
+
+            if col.find('电流') > -1:
+                error_dict[col] = save_percent(query_df[query_df[col] < -0.1].shape[0] / total)
+
+            if col.find('逆变器效率') > -1:
+                error_dict[col] = save_percent(query_df[(query_df[col] <= 0) | (query_df[col] >= 100)].shape[0] / total)
+
+            if col.find('温度') > -1:
+                error_dict[col] = save_percent(query_df[(query_df[col] < 0) | (query_df[col] > 100)].shape[0] / total)
+
+            if col.find('功率因数') > -1:
+                error_dict[col] = save_percent(query_df[(query_df[col] < 0) | (query_df[col] > 1)].shape[0] / total)
+
+        total, count = 0, 0
+        for k, v in error_dict.items():
+            if k != '箱变':
+                total = total + error_dict[k]
+                count = count + 1
+
+        error_dict['平均异常率'] = save_percent(total / count / 100)
+
+        total, count = 0, 0
+        for k, v in lose_dict.items():
+            if k != '箱变':
+                total = total + lose_dict[k]
+                count = count + 1
+
+        lose_dict['平均缺失率'] = save_percent(total / count / 100)
+
+        error_df = pd.concat([error_df, pd.DataFrame(error_dict, index=[0])])
+        lose_df = pd.concat([lose_df, pd.DataFrame(lose_dict, index=[0])])
+
+        error_df_cols = ['箱变', '平均异常率']
+        for col in error_df.columns:
+            if col not in error_df_cols:
+                error_df_cols.append(col)
+
+        lose_df_cols = ['箱变', '平均缺失率']
+        for col in lose_df.columns:
+            if col not in lose_df_cols:
+                lose_df_cols.append(col)
+
+        error_df = error_df[error_df_cols]
+        lose_df = lose_df[lose_df_cols]
+    except Exception as e:
+        print("异常文件", path.basename(file_name))
+        raise e
+
+    return error_df, lose_df
+
+
+def run(file_path):
+    df = read_file_to_df(file_path)
+    return calc(df, path.basename(file_path))
+
+
+if __name__ == '__main__':
+    # read_path = r'/data/download/大唐玉湖性能分析离线分析/05整理数据/逆变器数据'
+    # save_path = r'/data/download/大唐玉湖性能分析离线分析/06整理数据/逆变器数据'
+
+    read_path = r'D:\trans_data\大唐玉湖性能分析离线分析\test\yuanshi'
+    save_path = r'D:\trans_data\大唐玉湖性能分析离线分析\test\zhengli'
+    all_files = read_excel_files(read_path)
+
+    with multiprocessing.Pool(2) as pool:
+        df_arrys = pool.starmap(run, [(file,) for file in all_files])
+
+    error_df = pd.concat([df[0] for df in df_arrys])
+    lose_df = pd.concat([df[1] for df in df_arrys])
+    with pd.ExcelWriter(path.join(save_path, "玉湖光伏数据统计.xlsx")) as writer:
+        error_df.to_excel(writer, sheet_name='error_percent', index=False)
+        lose_df.to_excel(writer, sheet_name='lose_percent', index=False)
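
For reference, a minimal runnable sketch of the per-column missing/abnormal-rate arithmetic used above, on a toy frame (the column name, values and thresholds are illustrative, not from the source data):

    import pandas as pd

    df = pd.DataFrame({
        '时间': pd.date_range('2024-01-01', periods=6, freq='5min'),
        '逆变器效率': [98.5, None, 97.9, 101.2, None, 98.0],
    })
    total = len(df)
    numeric = pd.to_numeric(df['逆变器效率'], errors='coerce').dropna()
    lose = round((1 - len(numeric) / total) * 100, 4)   # 缺失率 %
    error = round(len(numeric[(numeric <= 0) | (numeric >= 100)]) / total * 100, 4)  # 异常率 %
    print(lose, error)  # 33.3333 16.6667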

+ 134 - 0
tmp_file/qinghai-nuomuhong-guifan.py

@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+"""
+Spyder 编辑器
+
+这是一个临时脚本文件。
+"""
+import datetime
+import multiprocessing
+from os import *
+
+import numpy as np
+import pandas as pd
+
+dianjian_str = """
+wind_turbine_number		
+time_stamp		时间
+active_power		有功功率 kW
+rotor_speed		风轮转速 rpm
+generator_speed		发电机转速 rpm
+wind_velocity		风速 m/s
+pitch_angle_blade_1		叶片1角度 °
+pitch_angle_blade_2		叶片2角度 °
+pitch_angle_blade_3		叶片3角度 °
+cabin_position		机舱位置 °
+true_wind_direction		
+yaw_error1		风向 °
+twisted_cable_angle		
+main_bearing_temperature		主轴温度 ℃
+gearbox_oil_temperature		齿轮箱温度 ℃
+gearbox_low_speed_shaft_bearing_temperature		齿轮箱轴承温度 ℃
+gearboxmedium_speed_shaftbearing_temperature		
+gearbox_high_speed_shaft_bearing_temperature		齿轮箱轴承温度2 ℃
+generatordrive_end_bearing_temperature		发电机驱动侧轴承温度 ℃
+generatornon_drive_end_bearing_temperature		发电机非驱动侧轴承温度 ℃
+cabin_temperature		机舱温度 ℃
+outside_cabin_temperature		舱外温度 ℃
+generator_winding1_temperature		
+generator_winding2_temperature		
+generator_winding3_temperature		
+front_back_vibration_of_the_cabin		
+side_to_side_vibration_of_the_cabin		
+required_gearbox_speed		
+inverter_speed_master_control		
+actual_torque		
+given_torque		
+clockwise_yaw_count		
+counterclockwise_yaw_count		
+unusable		
+power_curve_available		
+set_value_of_active_power		有功功率设定 kW
+wind_turbine_status		
+wind_turbine_status2		
+turbulence_intensity		
+"""
+
+datas = [i for i in dianjian_str.split("\n") if i]
+
+dianjian_dict = dict()
+
+for data in datas:
+    ds = data.split("\t")
+
+    if len(ds) == 3:
+        dianjian_dict[ds[0]] = ds[2]
+    else:
+        dianjian_dict[ds[0]] = ''
+
+
+def read_df(file_path):
+    df = pd.read_csv(file_path, header=[0, 1])
+
+    col_nams_map = dict()
+    pre_col = ""
+    for tuple_col in df.columns:
+        col1 = tuple_col[0]
+        col2 = tuple_col[1]
+        if str(col1).startswith("Unnamed"):
+            if pre_col:
+                col1 = pre_col
+                pre_col = ''
+            else:
+                col1 = ''
+        else:
+            pre_col = col1
+
+        if str(col2).startswith("Unnamed"):
+            col2 = ''
+
+        col_nams_map[str(tuple_col)] = ''.join([col1, col2])
+    # print(col_nams_map)
+    # for k, v in col_nams_map.items():
+    #     if str(v).endswith('采样值'):
+    #         col_nams_map[k] = str(v)[:-3]
+
+    df.columns = [str(col) for col in df.columns]
+    df.rename(columns=col_nams_map, inplace=True)
+
+    # for col, name in dianjian_dict.items():
+    #     if name in df.columns:
+    #         df.rename(columns={name: col}, inplace=True)
+
+    # for col in df.columns:
+    #     if col not in dianjian_dict.keys():
+    #         del df[col]
+
+    return df
+
+
+def get_wind_name_files(dir_path):
+    files = listdir(dir_path)
+    return files
+
+
+def combine_df(save_path, file):
+    begin = datetime.datetime.now()
+    df = read_df(file)
+    print("读取", file, df.shape)
+    df.replace("-", np.nan,inplace=True)
+    df.to_csv(path.join(save_path, path.basename(file)), encoding='utf-8', index=False)
+
+    print('整理完成', '耗时:', (datetime.datetime.now() - begin).seconds)
+
+
+if __name__ == '__main__':
+    read_path = r'/data/download/collection_data/1进行中/诺木洪风电场-甘肃-华电/收资数据/min-666'
+    save_path = r'/data/download/collection_data/1进行中/诺木洪风电场-甘肃-华电/清理数据/min-666'
+
+    # read_path = r'D:\trans_data\诺木洪\收资数据\min-666'
+    # save_path = r'D:\trans_data\诺木洪\清理数据\min-666'
+    if not path.exists(save_path):
+        makedirs(save_path, exist_ok=True)
+
+    with multiprocessing.Pool(20) as pool:
+        pool.starmap(combine_df, [(save_path, read_path + sep + file) for file in listdir(read_path)])
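
A hedged sketch of the two-row-header flattening that read_df performs, self-contained on an in-memory CSV (header values made up; note the script above resets the carried first-level name after one use, while this version carries it forward):

    import pandas as pd
    from io import StringIO

    raw = StringIO("时间,风速,\n,采样值,状态\n2024-01-01 00:00:00,5.1,0\n")
    df = pd.read_csv(raw, header=[0, 1])

    flat, pre = [], ''
    for top, sub in df.columns:
        top = pre if str(top).startswith('Unnamed') else top
        pre = top
        sub = '' if str(sub).startswith('Unnamed') else sub
        flat.append(f'{top}{sub}')
    df.columns = flat
    print(df.columns.tolist())  # ['时间', '风速采样值', '风速状态']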

+ 162 - 0
tmp_file/qinghai-nuomuhong.py

@@ -0,0 +1,162 @@
+# -*- coding: utf-8 -*-
+"""
+Spyder 编辑器
+
+这是一个临时脚本文件。
+"""
+import copy
+import datetime
+import multiprocessing
+from os import *
+
+import numpy as np
+import pandas as pd
+
+dianjian_str = """
+wind_turbine_number		
+time_stamp		时间
+active_power		有功功率 kW
+rotor_speed		风轮转速 rpm
+generator_speed		发电机转速 rpm
+wind_velocity		风速 m/s
+pitch_angle_blade_1		叶片1角度 °
+pitch_angle_blade_2		叶片2角度 °
+pitch_angle_blade_3		叶片3角度 °
+cabin_position		机舱位置 °
+true_wind_direction		
+yaw_error1		风向 °
+twisted_cable_angle		
+main_bearing_temperature		主轴温度 ℃
+gearbox_oil_temperature		齿轮箱温度 ℃
+gearbox_low_speed_shaft_bearing_temperature		齿轮箱轴承温度 ℃
+gearboxmedium_speed_shaftbearing_temperature		
+gearbox_high_speed_shaft_bearing_temperature		齿轮箱轴承温度2 ℃
+generatordrive_end_bearing_temperature		发电机驱动侧轴承温度 ℃
+generatornon_drive_end_bearing_temperature		发电机非驱动侧轴承温度 ℃
+cabin_temperature		机舱温度 ℃
+outside_cabin_temperature		舱外温度 ℃
+generator_winding1_temperature		
+generator_winding2_temperature		
+generator_winding3_temperature		
+front_back_vibration_of_the_cabin		
+side_to_side_vibration_of_the_cabin		
+required_gearbox_speed		
+inverter_speed_master_control		
+actual_torque		
+given_torque		
+clockwise_yaw_count		
+counterclockwise_yaw_count		
+unusable		
+power_curve_available		
+set_value_of_active_power		有功功率设定 kW
+wind_turbine_status		
+wind_turbine_status2		
+turbulence_intensity		
+"""
+
+datas = [i for i in dianjian_str.split("\n") if i]
+
+dianjian_dict = dict()
+
+for data in datas:
+    ds = data.split("\t")
+
+    if len(ds) == 3:
+        dianjian_dict[ds[0]] = ds[2]
+    else:
+        dianjian_dict[ds[0]] = ''
+
+
+def read_df(file_path):
+    df = pd.read_csv(file_path, header=[0, 1])
+
+    col_nams_map = dict()
+    pre_col = ""
+    for tuple_col in df.columns:
+        col1 = tuple_col[0]
+        col2 = tuple_col[1]
+        if str(col1).startswith("Unnamed"):
+            if pre_col:
+                col1 = pre_col
+                pre_col = ''
+            else:
+                col1 = ''
+        else:
+            pre_col = col1
+
+        if str(col2).startswith("Unnamed"):
+            col2 = ''
+
+        col_nams_map[str(tuple_col)] = ''.join([col1, col2])
+    print(col_nams_map)
+    for k, v in col_nams_map.items():
+        if str(v).endswith('采样值'):
+            col_nams_map[k] = str(v)[:-3]
+
+    df.columns = [str(col) for col in df.columns]
+    df.rename(columns=col_nams_map, inplace=True)
+
+    for col, name in dianjian_dict.items():
+        if name in df.columns:
+            df.rename(columns={name: col}, inplace=True)
+
+    for col in df.columns:
+        if col not in dianjian_dict.keys():
+            del df[col]
+
+    return df
+
+
+def get_wind_name_files(dir_path):
+    # 参数不能命名为 path,否则下面的 path.join 会在字符串上调用而报错
+    files = listdir(dir_path)
+
+    wind_files_map = dict()
+    for file in files:
+        full_file = path.join(dir_path, file)
+        file_datas = str(file).split("@")
+        key = file_datas[0].replace("HD", "HD2")
+        if key in wind_files_map.keys():
+            wind_files_map[key].append(full_file)
+        else:
+            wind_files_map[key] = [full_file]
+
+    return wind_files_map
+
+
+def combine_df(save_path, wind_name, files):
+    begin = datetime.datetime.now()
+    df = pd.DataFrame()
+    for file in files:
+        query_df = read_df(file)
+        print("读取", file, query_df.shape)
+        query_df['time_stamp'] = pd.to_datetime(query_df['time_stamp'])
+        query_df.set_index(keys='time_stamp', inplace=True)
+        query_df = query_df[~query_df.index.duplicated(keep='first')]
+        if df.empty:
+            df = copy.deepcopy(query_df)
+        else:
+            df = pd.concat([df, query_df], join='inner')
+    df.reset_index(inplace=True)
+    df['wind_turbine_number'] = wind_name
+    for col, name in dianjian_dict.items():
+        if col not in df.columns:
+            df[col] = np.nan
+
+    df = df[dianjian_dict.keys()]
+    df.to_csv(path.join(save_path, wind_name + ".csv"), encoding='utf-8', index=False)
+
+    print(wind_name, '整理完成', '耗时:', (datetime.datetime.now() - begin).seconds)
+
+
+if __name__ == '__main__':
+    read_path = r'/data/download/collection_data/1进行中/诺木洪风电场-甘肃-华电/收资数据/sec'
+    save_path = r'/data/download/collection_data/1进行中/诺木洪风电场-甘肃-华电/收资数据/sec_采样值'
+
+    # read_path = r'D:\trans_data\诺木洪\收资数据\min'
+    # save_path = r'D:\trans_data\诺木洪\清理数据\min'
+    if not path.exists(save_path):
+        makedirs(save_path, exist_ok=True)
+    wind_files_map = get_wind_name_files(read_path)
+
+    with multiprocessing.Pool(20) as pool:
+        pool.starmap(combine_df, [(save_path, wind_name, files) for wind_name, files in wind_files_map.items()])
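
In miniature, the merge step in combine_df: each file becomes a time-indexed frame, duplicates within a file are dropped, and the parts are concatenated (toy values; unlike the script, this sketch also de-duplicates across files at the end):

    import pandas as pd

    a = pd.DataFrame({'time_stamp': ['2024-01-01 00:00:00', '2024-01-01 00:00:05'], 'power': [100, 110]})
    b = pd.DataFrame({'time_stamp': ['2024-01-01 00:00:05', '2024-01-01 00:00:10'], 'power': [110, 120]})

    frames = []
    for part in (a, b):
        part['time_stamp'] = pd.to_datetime(part['time_stamp'])
        part = part.set_index('time_stamp')
        frames.append(part[~part.index.duplicated(keep='first')])
    merged = pd.concat(frames).sort_index()
    merged = merged[~merged.index.duplicated(keep='first')]
    print(merged)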

+ 208 - 0
tmp_file/qitaihe_biaozhunhua.py

@@ -0,0 +1,208 @@
+import datetime
+import multiprocessing
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(__file__).split("tmp_file")[0])
+
+import pandas as pd
+
+from utils.file.trans_methods import read_file_to_df, read_excel_files
+
+
+def get_time_space_count(start_time, end_time, time_space=1):
+    """
+    获取俩个时间之间的个数
+    :return: 查询时间间隔
+    """
+
+    if isinstance(start_time, str):
+        start_time = datetime.datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')
+
+    if isinstance(end_time, str):
+        end_time = datetime.datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
+
+    delta = end_time - start_time
+    total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
+
+    return abs(int(total_seconds / time_space)) + 1
+
+
+def save_percent(value, save_decimal=7):
+    return round(value * 100, save_decimal)
+
+
+def read_and_select(file):
+    result_df = pd.DataFrame()
+    # wind_name = os.path.basename(file_path).split('.')[0]
+    df = pd.read_csv(file)
+    df['systime'] = pd.to_datetime(df['systime'], errors='coerce')
+    # condation1 = (df[df['systime'] >= '2024-11-12 00:00:00']) & (df[df['systime'] <= '2024-11-19 12:15:35'])
+    # condation2 = (df[df['systime'] >= '2024-12-02 00:00:00']) & (df[df['systime'] <= '2024-12-31 23:59:55'])
+    # condation3 = (df[df['systime'] >= '2025-01-01 00:00:00']) & (df[df['systime'] <= '2025-01-21 23:59:55'])
+    # condation4 = (df[df['systime'] >= '2025-01-31 00:00:00']) & (df[df['systime'] <= '2025-02-04 23:59:55'])
+    #
+    # condation = condation1 | condation2 | condation3 | condation4
+    #
+    # df = df[condation]
+
+    read_cols = list(df.columns)
+    read_cols.remove('systime')
+    read_cols.remove('wecnum')
+
+    wind_name = os.path.basename(file).replace('.csv', '')
+    result_df['wecnum'] = [wind_name]
+    # df = df.query("(Time>='2024-06-01 00:00:00') & (Time<'2024-12-01 00:00:00')")
+
+    count1 = get_time_space_count('2024-11-12 00:00:00', '2024-11-19 12:15:35', 5)
+    count2 = get_time_space_count('2024-12-02 00:00:00', '2024-12-31 23:59:55', 5)
+    count3 = get_time_space_count('2025-01-01 00:00:00', '2025-01-21 23:59:55', 5)
+    count4 = get_time_space_count('2025-01-31 00:00:00', '2025-02-04 23:59:55', 5)
+
+    count = sum([count1, count2, count3, count4])
+
+    print(df['systime'].min(), df['systime'].max(), count)
+    repeat_time_count = df.shape[0] - len(df['systime'].unique())
+    print(wind_name, count, repeat_time_count)
+    result_df['重复率'] = [save_percent(repeat_time_count / count)]
+    result_df['重复次数'] = [repeat_time_count]
+    result_df['总记录数'] = [count]
+
+    for read_col in read_cols:
+
+        if read_col not in ['systime', 'plcvernew', 'dmsver', 'scadaver', 'collectime']:
+            df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
+
+    group_df = df.groupby(by=['wecnum']).count()
+    group_df.reset_index(inplace=True)
+    count_df = pd.DataFrame(group_df)
+    total_count = count_df[read_cols].values[0].sum()
+    print(wind_name, total_count, count * len(read_cols))
+    result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / (count * len(read_cols)))]
+    # result_df['缺失数值'] = [
+    #     '-'.join(
+    #         [str(read_cols[index]) + ':' + str(count - i) for index, i in enumerate(count_df[read_cols].values[0])])]
+    del group_df
+
+    fengsu_count = 0
+    fengsu_cols = ['iwinfil']
+    fengsu_str = ''
+    for col in fengsu_cols:
+        now_count = df[(df[col] < 0) | (df[col] > 80)].shape[0]
+        fengsu_count = fengsu_count + now_count
+        fengsu_str = fengsu_str + ',' + col + ':' + str(now_count)
+    result_df['风速异常'] = [fengsu_str]
+
+    gonglv_cols = ['power']
+    gonglv_count = 0
+    gonglv_str = ''
+    for col in gonglv_cols:
+        now_count = df[(df[col] < -200) | (df[col] > 3000)].shape[0]
+        gonglv_count = gonglv_count + now_count
+        gonglv_str = gonglv_str + ',' + col + ':' + str(now_count)
+    result_df['功率异常'] = [gonglv_str]
+
+    result_df['平均异常率'] = [
+        save_percent((fengsu_count + gonglv_count) / ((len(fengsu_cols) + len(gonglv_cols)) * count))]
+
+    return result_df
+
+
+def save_to_csv(df: pd.DataFrame, path):
+    df.to_csv(path, encoding='utf8', index=False)
+
+
+def read_and_select_time(file):
+    df = pd.read_csv(file, usecols=['collectime'])
+    df['collectime'] = pd.to_datetime(df['collectime'])
+
+    df1 = df[(df['collectime'] >= '2024-11-12 00:00:00') & (df['collectime'] <= '2024-11-19 23:59:59')]
+    df2 = df[(df['collectime'] >= '2024-12-02 00:00:00') & (df['collectime'] <= '2024-12-31 23:59:59')]
+    df3 = df[(df['collectime'] >= '2025-01-01 00:00:00') & (df['collectime'] <= '2025-01-21 23:59:59')]
+    df4 = df[(df['collectime'] >= '2025-01-31 00:00:00') & (df['collectime'] <= '2025-02-04 23:59:59')]
+
+    return [(df1['collectime'].min(), df1['collectime'].max()), (df2['collectime'].min(), df2['collectime'].max()),
+            (df3['collectime'].min(), df3['collectime'].max()), (df4['collectime'].min(), df4['collectime'].max())]
+
+
+if __name__ == '__main__':
+    # read_cols = ['Time', '设备主要状态', '功率曲线风速', '湍流强度', '实际风速', '有功功率', '桨叶角度A', '桨叶角度B',
+    #              '桨叶角度C', '机舱内温度', '机舱外温度', '绝对风向', '机舱绝对位置', '叶轮转速', '发电机转速',
+    #              '瞬时风速',
+    #              '有功设定反馈', '当前理论可发最大功率', '空气密度', '偏航误差', '发电机扭矩', '瞬时功率', '风向1s',
+    #              '偏航压力', '桨叶1速度', '桨叶2速度', '桨叶3速度', '桨叶1角度给定', '桨叶2角度给定', '桨叶3角度给定',
+    #              '轴1电机电流', '轴2电机电流', '轴3电机电流', '轴1电机温度', '轴2电机温度', '轴3电机温度', '待机',
+    #              '启动',
+    #              '偏航', '并网', '限功率', '正常发电', '故障', '计入功率曲线', '运行发电机冷却风扇1',
+    #              '运行发电机冷却风扇2',
+    #              '激活偏航解缆阀', '激活偏航刹车阀', '激活风轮刹车阀', '激活顺时针偏航', '激活逆时针偏航', '电缆扭角']
+
+    # select_cols = ['wecnum', 'systime', 'power', 'iwinfil', 'hubpos1', 'hubpos2', 'hubpos3', 'windir']
+
+    # read_dir = r'/data/download/collection_data/1进行中/七台河风电场-黑龙江-华电/收资数据/七台河/秒级数据/sec'
+    # files = read_excel_files(read_dir)
+    # dfs = list()
+    # with multiprocessing.Pool(33) as pool:
+    #     dfs = pool.map(read_file_to_df, files)
+    # df = pd.concat(dfs, ignore_index=True)
+    # print(df.columns)
+    # df['systime'] = pd.to_datetime(df['systime'], errors='coerce')
+    # df['wecnum'] = pd.to_numeric(df['wecnum'], errors='coerce')
+    # read_cols = list(df.columns)
+    # read_cols.remove('systime')
+    # read_cols.remove('wecnum')
+    #
+    # wind_names = df['wecnum'].unique()
+    tmp_save_dir = r'/home/wzl/test_data/qitaihe/sec'
+    # with multiprocessing.Pool(4) as pool:
+    #     pool.starmap(save_to_csv,
+    #                  [(df[df['wecnum'] == wind_name], os.path.join(tmp_save_dir, str(wind_name) + '.csv')) for wind_name
+    #                   in
+    #                   wind_names])
+    #
+    # del df
+    all_files = read_excel_files(tmp_save_dir)
+
+    with multiprocessing.Pool(10) as pool:
+        dfs = pool.starmap(read_and_select,
+                           [(file,) for file in all_files])
+
+    resu_df = pd.concat(dfs, ignore_index=True)
+    print(resu_df.columns)
+    resu_df.sort_values(by=['wecnum'], inplace=True)
+    resu_df.to_csv("七台河-5秒.csv", encoding='utf8', index=False)
+
+    # with multiprocessing.Pool(10) as pool:
+    #     datas = pool.map(read_and_select_time, all_files)
+    #
+    # min1 = list()
+    # max1 = list()
+    #
+    # min2 = list()
+    # max2 = list()
+    #
+    # min3 = list()
+    # max3 = list()
+    #
+    # min4 = list()
+    # max4 = list()
+    #
+    # for data in datas:
+    #     print(data)
+    #     data1, data2, data3, data4 = data[0], data[1], data[2], data[3]
+    #     min1.append(data1[0])
+    #     max1.append(data1[1])
+    #
+    #     min2.append(data2[0])
+    #     max2.append(data2[1])
+    #
+    #     min3.append(data3[0])
+    #     max3.append(data3[1])
+    #
+    #     min4.append(data4[0])
+    #     max4.append(data4[1])
+    #
+    # print(min(min1), max(max1))
+    # print(min(min2), max(max2))
+    # print(min(min3), max(max3))
+    # print(min(min4), max(max4))
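
The expected counts above follow directly from the window length and the 5 s spacing; e.g. 2024-11-12 00:00:00 to 2024-11-19 12:15:35 spans 648,935 s, giving 648935 / 5 + 1 = 129,788 samples. A quick self-contained check:

    import datetime

    def expected_count(start, end, spacing_s):
        delta = datetime.datetime.fromisoformat(end) - datetime.datetime.fromisoformat(start)
        return abs(int(delta.total_seconds() / spacing_s)) + 1

    print(expected_count('2024-11-12 00:00:00', '2024-11-19 12:15:35', 5))  # 129788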

+ 139 - 0
tmp_file/qitaihe_biaozhunhua_minute.py

@@ -0,0 +1,139 @@
+import datetime
+import multiprocessing
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(__file__).split("tmp_file")[0])
+
+import pandas as pd
+
+from utils.file.trans_methods import read_file_to_df, read_excel_files
+
+
+def get_time_space_count(start_time: datetime.datetime, end_time: datetime.datetime, time_space=1):
+    """
+    获取俩个时间之间的个数
+    :return: 查询时间间隔
+    """
+    delta = end_time - start_time
+    total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
+
+    return abs(int(total_seconds / time_space)) + 1
+
+
+def save_percent(value, save_decimal=7):
+    return round(value * 100, save_decimal)
+
+
+def read_and_select(file):
+    result_df = pd.DataFrame()
+    # wind_name = os.path.basename(file_path).split('.')[0]
+    df = read_file_to_df(file)
+    read_cols = list(df.columns)
+    read_cols.remove('系统时间')
+    read_cols.remove('风机号')
+
+    wind_name = os.path.basename(file).replace('.csv', '')
+    result_df['风机号'] = [wind_name]
+    # df = df.query("(Time>='2024-06-01 00:00:00') & (Time<'2024-12-01 00:00:00')")
+    df['系统时间'] = pd.to_datetime(df['系统时间'], errors='coerce')
+
+    df = df[df['系统时间'] <= '2024-11-20 09:10:00']
+
+    # count = get_time_space_count(df['系统时间'].min(), df['系统时间'].max(), 600)
+    count = 59959
+    print(df['系统时间'].min(), df['系统时间'].max(), count)
+    repeat_time_count = df.shape[0] - len(df['系统时间'].unique())
+    print(wind_name, count, repeat_time_count)
+    result_df['重复率'] = [save_percent(repeat_time_count / count)]
+    result_df['重复次数'] = [repeat_time_count]
+    result_df['总记录数'] = [count]
+    result_df['数据条数'] = [df.shape[0]]
+
+    for read_col in read_cols:
+
+        if read_col not in ['系统时间']:
+            df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
+
+    group_df = df.groupby(by=['风机号']).count()
+    group_df.reset_index(inplace=True)
+    count_df = pd.DataFrame(group_df)
+    total_count = count_df[read_cols].values[0].sum()
+    print(wind_name, total_count, count * len(read_cols))
+    result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / (count * len(read_cols)))]
+    # result_df['缺失数值'] = [
+    #     '-'.join(
+    #         [str(read_cols[index]) + ':' + str(count - i) for index, i in enumerate(count_df[read_cols].values[0])])]
+    del group_df
+
+    fengsu_count = 0
+    fengsu_cols = ['1秒平均风速[m/s]']
+    fengsu_str = ''
+    for col in fengsu_cols:
+        now_count = df[(df[col] < 0) | (df[col] > 80)].shape[0]
+        fengsu_count = fengsu_count + now_count
+        fengsu_str = fengsu_str + ',' + col + ':' + str(now_count)
+    result_df['风速异常'] = [fengsu_str]
+
+    gonglv_cols = ['有功功率[kW]']
+    gonglv_count = 0
+    gonglv_str = ''
+    for col in gonglv_cols:
+        now_count = df[(df[col] < -200) | (df[col] > 3000)].shape[0]
+        gonglv_count = gonglv_count + now_count
+        gonglv_str = gonglv_str + ',' + col + ':' + str(now_count)
+    result_df['功率异常'] = [gonglv_str]
+
+    result_df['平均异常率'] = [
+        save_percent((fengsu_count + gonglv_count) / ((len(fengsu_cols) + len(gonglv_cols)) * count))]
+
+    return result_df
+
+
+def save_to_csv(df: pd.DataFrame, path):
+    df.to_csv(path, encoding='utf8', index=False)
+
+
+if __name__ == '__main__':
+    # read_cols = ['Time', '设备主要状态', '功率曲线风速', '湍流强度', '实际风速', '有功功率', '桨叶角度A', '桨叶角度B',
+    #              '桨叶角度C', '机舱内温度', '机舱外温度', '绝对风向', '机舱绝对位置', '叶轮转速', '发电机转速',
+    #              '瞬时风速',
+    #              '有功设定反馈', '当前理论可发最大功率', '空气密度', '偏航误差', '发电机扭矩', '瞬时功率', '风向1s',
+    #              '偏航压力', '桨叶1速度', '桨叶2速度', '桨叶3速度', '桨叶1角度给定', '桨叶2角度给定', '桨叶3角度给定',
+    #              '轴1电机电流', '轴2电机电流', '轴3电机电流', '轴1电机温度', '轴2电机温度', '轴3电机温度', '待机',
+    #              '启动',
+    #              '偏航', '并网', '限功率', '正常发电', '故障', '计入功率曲线', '运行发电机冷却风扇1',
+    #              '运行发电机冷却风扇2',
+    #              '激活偏航解缆阀', '激活偏航刹车阀', '激活风轮刹车阀', '激活顺时针偏航', '激活逆时针偏航', '电缆扭角']
+
+    # select_cols = ['风机号', '系统时间', 'power', 'iwinfil', 'hubpos1', 'hubpos2', 'hubpos3', 'windir']
+
+    # read_dir = r'D:\data\shouzi\qitaihe\十分钟数据'
+    # files = read_excel_files(read_dir)
+    # dfs = list()
+    # with multiprocessing.Pool(len(files)) as pool:
+    #     dfs = pool.map(read_file_to_df, files)
+    # df = pd.concat(dfs, ignore_index=True)
+    # print(df.columns)
+    # df['系统时间'] = pd.to_datetime(df['系统时间'], errors='coerce')
+    # df['风机号'] = pd.to_numeric(df['sheet_name'], errors='coerce')
+    # del df['sheet_name']
+    #
+    # wind_names = df['风机号'].unique()
+    tmp_save_dir = r'D:\data\shouzi\qitaihe\tmp'
+    # with multiprocessing.Pool(4) as pool:
+    #     pool.starmap(save_to_csv,
+    #                  [(df[df['风机号'] == wind_name], os.path.join(tmp_save_dir, str(wind_name) + '.csv')) for wind_name
+    #                   in
+    #                   wind_names])
+    #
+    # del df
+    all_files = read_excel_files(tmp_save_dir)
+    with multiprocessing.Pool(10) as pool:
+        dfs = pool.starmap(read_and_select,
+                           [(file,) for file in all_files])
+
+    resu_df = pd.concat(dfs, ignore_index=True)
+    print(resu_df.columns)
+    resu_df.sort_values(by=['风机号'], inplace=True)
+    resu_df.to_csv("七台河-10分.csv", encoding='utf8', index=False)

+ 38 - 0
tmp_file/queshi_bili.py

@@ -0,0 +1,38 @@
+import datetime
+
+import pandas as pd
+
+
+def get_time_space_count(start_time: datetime.datetime, end_time: datetime.datetime, time_space=1):
+    """
+    获取俩个时间之间的个数
+    :return: 查询时间间隔
+    """
+    delta = end_time - start_time
+    total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
+
+    return abs(int(total_seconds / time_space))
+
+
+df = pd.read_csv("D:\data\白玉山后评估数据资料\十分钟.csv", encoding='utf8')
+
+df['时间'] = pd.to_datetime(df['时间'])
+df['plus_10min'] = df['时间'] + pd.Timedelta(minutes=10)
+
+names = list(set(df['设备名称']))
+names.sort()
+
+count = get_time_space_count(datetime.datetime.strptime('2023-09-01 00:00:00', '%Y-%m-%d %H:%M:%S'),
+                             datetime.datetime.strptime('2024-09-01 00:00:00', '%Y-%m-%d %H:%M:%S'), 600)
+
+result_df = pd.DataFrame(df['设备名称'].value_counts())
+result_df.reset_index(inplace=True)
+result_df.columns = ['风机', '数量']
+
+result_df['总数'] = count
+
+result_df['完整度'] = result_df['数量'].apply(lambda x: round(x * 100 / count, 2))
+
+result_df.sort_values(by=['风机'], inplace=True)
+
+print(result_df)
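
An equivalent, hedged way to get the same completeness figure is to reindex each turbine onto the full 10-minute grid and count survivors (toy series standing in for one turbine):

    import pandas as pd

    grid = pd.date_range('2023-09-01 00:00:00', '2024-09-01 00:00:00', freq='10min', inclusive='left')
    have = pd.Series(1, index=grid[:1000])  # pretend only 1000 rows survived
    completeness = round(len(have.reindex(grid).dropna()) * 100 / len(grid), 2)
    print(len(grid), completeness)  # 52704 1.9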

+ 42 - 0
tmp_file/read_and_draw_png.py

@@ -0,0 +1,42 @@
+import multiprocessing
+from os import *
+
+from etl.wind_power.min_sec.ClassIdentifier import ClassIdentifier
+from utils.draw.draw_file import scatter
+from utils.file.trans_methods import read_file_to_df
+
+
+def draw(file, fengchang='测试'):
+    name = path.basename(file).split('.')[0]
+    df = read_file_to_df(file)
+    del df['lab']
+    identifier = ClassIdentifier(wind_turbine_number='test', origin_df=df, rated_power=5000, cut_out_speed=20,
+                                 active_power='active_power', wind_velocity='wind_velocity',
+                                 pitch_angle_blade='pitch_angle_blade_1')
+    df = identifier.run()
+
+    df.loc[df['active_power'] <= 0, 'lab'] = -1
+
+    print(df.groupby('lab').count())
+    color_map = {-1: 'red', 0: 'green', 1: 'blue', 2: 'black', 3: 'orange', 4: 'magenta'}
+    c = df['lab'].map(color_map)
+
+    # -1:停机 0:好点  1:欠发功率点;2:超发功率点;3:额定风速以上的超发功率点 4: 限电
+    legend_map = {"停机": 'red', "好点": 'green', "欠发": 'blue', "超发": 'black', "额定风速以上的超发": 'orange',
+                  "限电": 'magenta'}
+    scatter(name, x_label='风速', y_label='有功功率', x_values=df['wind_velocity'].values,
+            y_values=df['active_power'].values, color=c, col_map=legend_map,
+            save_file_path=path.dirname(
+                path.dirname(__file__)) + sep + "tmp" + sep + str(fengchang) + sep + name + '结果.png')
+
+
+if __name__ == '__main__':
+    read_dir = r"D:\data\logs\matlib-test"
+
+    files = [read_dir + sep + i for i in listdir(read_dir)]
+
+    if len(files) == 1:
+        draw(files[0], "和风元宝山4")
+    else:
+        with multiprocessing.Pool(4) as pool:
+            pool.starmap(draw, [(file, "和风元宝山4") for file in files])

+ 27 - 0
tmp_file/select_part_cols.py

@@ -0,0 +1,27 @@
+import datetime
+import multiprocessing
+from os import *
+
+import pandas as pd
+
+read_dir = r'/data/download/collection_data/2完成/招远风电场-山东-大唐/清理数据/WOF01000010-WOB000002_ZY24年7-9月秒级/second'
+save_dir = r'/data/download/collection_data/2完成/招远风电场-山东-大唐/清理数据/WOF01000010-WOB000002_ZY24年7-9月秒级/second_select_yaw_error1_20241014'
+
+
+def read_and_select_and_save(file):
+    df = pd.read_csv(read_dir + sep + file,
+                     usecols=['active_power', 'wind_velocity', 'pitch_angle_blade_1', 'yaw_error1', 'lab'])
+    df = df[df['yaw_error1'] <= 360]
+    df['yaw_error1'] = df['yaw_error1'].apply(lambda x: x - 360 if 180 <= x <= 360 else x)
+    condition = (df['active_power'] > 0) & (df['wind_velocity'] > 0)
+    df = df[condition]
+
+    df.to_csv(path.join(save_dir, file), index=False, encoding='utf8')
+    print(f'{file}处理完成')
+
+
+if __name__ == '__main__':
+    begin = datetime.datetime.now()
+    with multiprocessing.Pool(32) as pool:
+        pool.starmap(read_and_select_and_save, [(file,) for file in listdir(read_dir)])
+    print(f'总耗时:{datetime.datetime.now() - begin}')
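
The per-row lambda above maps [180, 360] onto [-180, 0]; a vectorized equivalent that gives the same result on that range is:

    import pandas as pd

    s = pd.Series([0.0, 90.0, 180.0, 270.0, 359.0])
    wrapped = ((s + 180.0) % 360.0) - 180.0
    print(wrapped.tolist())  # [0.0, 90.0, -180.0, -90.0, -1.0]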

+ 114 - 0
tmp_file/taipingli_biaozhunhua.py

@@ -0,0 +1,114 @@
+import datetime
+import multiprocessing
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(__file__).split("tmp_file")[0])
+
+import pandas as pd
+
+from utils.file.trans_methods import read_file_to_df
+
+
+def get_time_space_count(start_time: datetime.datetime, end_time: datetime.datetime, time_space=1):
+    """
+    获取俩个时间之间的个数
+    :return: 查询时间间隔
+    """
+    delta = end_time - start_time
+    total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
+
+    return abs(int(total_seconds / time_space)) + 1
+
+
+def save_percent(value, save_decimal=7):
+    return round(value * 100, save_decimal)
+
+
+def read_and_select(df, wind_name, read_cols):
+    result_df = pd.DataFrame()
+    # wind_name = os.path.basename(file_path).split('.')[0]
+    result_df['风机号'] = [wind_name]
+    # df = df.query("(Time>='2024-06-01 00:00:00') & (Time<'2024-12-01 00:00:00')")
+    count = get_time_space_count(df['Time'].min(), df['Time'].max(), 1)
+    print(df['Time'].min(), df['Time'].max(), count)
+    repeat_time_count = df.shape[0] - len(df['Time'].unique())
+    print(wind_name, count, repeat_time_count)
+    result_df['重复率'] = [save_percent(repeat_time_count / count)]
+    result_df['重复次数'] = [repeat_time_count]
+    result_df['总记录数'] = [count]
+
+    for read_col in read_cols:
+
+        if read_col != 'Time':
+            df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
+
+    group_df = df.groupby(by=['风机号']).count()
+    group_df.reset_index(inplace=True)
+    count_df = pd.DataFrame(group_df)
+    total_count = count_df[read_cols].values[0].sum()
+    print(wind_name, total_count, count * len(read_cols))
+    result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / (count * len(read_cols)))]
+    result_df['缺失数值'] = [
+        '-'.join(
+            [str(read_cols[index]) + ':' + str(count - i) for index, i in enumerate(count_df[read_cols].values[0])])]
+    del group_df
+
+    fengsu_count = 0
+    fengsu_cols = [i for i in read_cols if i.find('风速') > -1]
+    fengsu_str = ''
+    for col in fengsu_cols:
+        now_count = df[(df[col] < 0) | (df[col] > 80)].shape[0]
+        fengsu_count = fengsu_count + now_count
+        fengsu_str = fengsu_str + ',' + col + ':' + str(now_count)
+    result_df['风速异常'] = [fengsu_str]
+
+    gonglv_cols = ['有功功率(kW)', '风机出口有功功率(kW)']
+    gonglv_count = 0
+    gonglv_str = ''
+    for col in gonglv_cols:
+        now_count = df[(df[col] < -200) | (df[col] > 3000)].shape[0]
+        gonglv_count = gonglv_count + now_count
+        gonglv_str = gonglv_str + ',' + col + ':' + str(now_count)
+    result_df['功率异常'] = [gonglv_str]
+
+    result_df['平均异常率'] = [
+        save_percent((fengsu_count + gonglv_count) / ((len(fengsu_cols) + len(gonglv_cols)) * count))]
+
+    return result_df
+
+
+if __name__ == '__main__':
+    # read_cols = ['Time', '设备主要状态', '功率曲线风速', '湍流强度', '实际风速', '有功功率', '桨叶角度A', '桨叶角度B',
+    #              '桨叶角度C', '机舱内温度', '机舱外温度', '绝对风向', '机舱绝对位置', '叶轮转速', '发电机转速',
+    #              '瞬时风速',
+    #              '有功设定反馈', '当前理论可发最大功率', '空气密度', '偏航误差', '发电机扭矩', '瞬时功率', '风向1s',
+    #              '偏航压力', '桨叶1速度', '桨叶2速度', '桨叶3速度', '桨叶1角度给定', '桨叶2角度给定', '桨叶3角度给定',
+    #              '轴1电机电流', '轴2电机电流', '轴3电机电流', '轴1电机温度', '轴2电机温度', '轴3电机温度', '待机',
+    #              '启动',
+    #              '偏航', '并网', '限功率', '正常发电', '故障', '计入功率曲线', '运行发电机冷却风扇1',
+    #              '运行发电机冷却风扇2',
+    #              '激活偏航解缆阀', '激活偏航刹车阀', '激活风轮刹车阀', '激活顺时针偏航', '激活逆时针偏航', '电缆扭角']
+
+    read_dir = r'D:\data\tmp\sec.csv'
+    df = read_file_to_df(read_dir)
+    print(df.columns)
+    del df['Unnamed: 79']
+    df.rename(columns={'Unnamed: 0': 'Time'}, inplace=True)
+    print(df.columns)
+    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
+    df['风机号'] = pd.to_numeric(df['风机号'], errors='coerce')
+    df = df[df['风机号'].isin([i for i in range(1, 6)])]
+    read_cols = list(df.columns)
+    read_cols.remove('Time')
+    read_cols.remove('风机号')
+
+    wind_names = df['风机号'].unique()
+    with multiprocessing.Pool(5) as pool:
+        dfs = pool.starmap(read_and_select,
+                           [(df[df['风机号'] == wind_name], wind_name, read_cols) for wind_name in wind_names])
+
+    resu_df = pd.concat(dfs, ignore_index=True)
+    print(resu_df.columns)
+    resu_df.sort_values(by=['风机号'], inplace=True)
+    resu_df.to_csv("太平里-1秒.csv", encoding='utf8', index=False)

+ 19 - 0
tmp_file/test_wave.py

@@ -0,0 +1,19 @@
+import sys
+from os import *
+
+sys.path.insert(0, path.abspath(__file__).split("tmp_file")[0])
+
+
+if __name__ == '__main__':
+    env = 'dev'
+    if len(sys.argv) >= 2:
+        env = sys.argv[1]
+
+    conf_path = path.abspath(__file__).split("tmp_file")[0] + f"/conf/etl_config_{env}.yaml"
+    environ['ETL_CONF'] = conf_path
+    environ['env'] = env
+    from etl.wind_power.wave.WaveTrans import WaveTrans
+    test = WaveTrans('SKF001', r'/home/wzl/test_data/sdk_data/sdk_data', r'/home/wzl/test_data/sdk_data')
+    # test = WaveTrans('SKF001', r'D:\data\sdk_data\sdk_data_less', r'/home/wzl/test_data/sdk_data')
+
+    test.run()

+ 55 - 0
tmp_file/zibo_guzhang_select_time.py

@@ -0,0 +1,55 @@
+from utils.file.trans_methods import *
+
+# 放在通配导入之后,确保 datetime/timedelta 指向类而不会被遮蔽
+from datetime import datetime, timedelta
+
+
+def convert_and_calculate_time_range(time_str):
+    # 解析原始字符串
+    date_part = time_str[:6]
+    time_part = time_str[7:]
+
+    # 将短日期格式转换为完整年份
+    year = '20' + date_part[:2]
+    month = date_part[2:4]
+    day = date_part[4:]
+
+    hour = time_part[:2]
+    minute = time_part[2:]
+
+    # 创建 datetime 对象
+    base_time = datetime.strptime(f"{year}-{month}-{day} {hour}:{minute}", "%Y-%m-%d %H:%M")
+
+    # 计算时间区间
+    start_time = base_time.replace(second=0, microsecond=0) - timedelta(minutes=2)
+    end_time = base_time.replace(second=0, microsecond=0) + timedelta(minutes=3)
+
+    return base_time.strftime("%Y-%m-%d %H:%M"), start_time.strftime("%Y-%m-%d %H:%M:%S"), end_time.strftime(
+        "%Y-%m-%d %H:%M:%S")
+
+
+all_df = read_file_to_df(r"D:\data\淄博\故障记录_filtered.csv")
+all_df['激活时间'] = pd.to_datetime(all_df['激活时间'])
+
+all_files = read_excel_files(r"D:\data\淄博\淄博风场buffer文件(1)")
+
+dfs = pd.DataFrame()
+
+for file in all_files:
+    base_name = path.basename(file)
+    if base_name.startswith("b"):
+        try:
+            turbine_no = int(base_name.split("_")[0].replace("b", ""))
+            base_time, start_time, end_time = convert_and_calculate_time_range(
+                base_name.replace(base_name.split("_")[0] + "_", "")[0:11])
+        except Exception as e:
+            print("error:", file)
+            raise e
+
+        condition = ((all_df['激活时间'] >= start_time) & (all_df['激活时间'] < end_time)
+                     & (all_df['风机名'] == turbine_no))
+        dfs = pd.concat([dfs, all_df[condition]])
+
+dfs.drop_duplicates(inplace=True)
+
+dfs.to_csv(r"D:\data\淄博\result.csv", encoding='utf8', index=False)

+ 87 - 0
tmp_file/压缩内读取.py

@@ -0,0 +1,87 @@
+import zipfile
+import rarfile
+import pandas as pd
+from io import BytesIO
+import os
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
+
+
+# 递归处理压缩文件或文件夹
+def process_compressed_file(file_path, file_name=None):
+    if file_name is None:
+        file_name = file_path
+
+    if file_name.endswith('.zip'):
+        # 处理 ZIP 文件
+        with zipfile.ZipFile(file_path, 'r') as z:
+            for inner_file in z.namelist():
+                with z.open(inner_file) as f:
+                    inner_file_content = BytesIO(f.read())
+                    yield from process_compressed_file(inner_file_content, inner_file)
+    elif file_name.endswith('.rar'):
+        # 处理 RAR 文件
+        with rarfile.RarFile(file_path) as rf:
+            for inner_file in rf.namelist():
+                with rf.open(inner_file) as f:
+                    inner_file_content = BytesIO(f.read())
+                    yield from process_compressed_file(inner_file_content, inner_file)
+    elif file_name.endswith('.csv'):
+        # 处理 CSV 文件(支持 GBK 编码)
+        if isinstance(file_path, BytesIO):
+            file_path.seek(0)  # 重置指针
+            df = pd.read_csv(file_path, encoding='gbk')
+        else:
+            df = pd.read_csv(file_path, encoding='gbk')
+        yield df
+    elif file_name.endswith('.csv.gz'):
+        # 处理 GZIP 压缩的 CSV 文件(支持 GBK 编码)
+        if isinstance(file_path, BytesIO):
+            file_path.seek(0)  # 重置指针
+            df = pd.read_csv(file_path, compression='gzip', encoding='gbk')
+        else:
+            df = pd.read_csv(file_path, compression='gzip', encoding='gbk')
+        yield df
+    elif isinstance(file_path, str) and os.path.isdir(file_path):
+        # 处理文件夹
+        for root, _, files in os.walk(file_path):
+            for file in files:
+                full_path = os.path.join(root, file)
+                yield from process_compressed_file(full_path, file)
+    else:
+        print(f"不支持的文件格式: {file_name}")
+
+
+# 收集压缩包内解析出的所有 DataFrame
+def process_file_concurrently(file_path):
+    # process_compressed_file 是生成器,真正的读取与解析发生在迭代时;
+    # 原先向线程池提交 lambda x: x 只是原样返回结果,并无并发收益,这里直接收集
+    return list(process_compressed_file(file_path))
+
+
+# 主函数
+def main():
+    # 压缩文件路径
+    # 使用多进程处理压缩文件
+    dfs = []
+    with ProcessPoolExecutor() as process_pool:
+        futures = []
+        futures.append(process_pool.submit(process_file_concurrently, compressed_file_path))  # 提交任务到进程池
+        for future in as_completed(futures):
+            dfs.extend(future.result())  # 获取结果
+
+    # 合并所有 DataFrame
+    if dfs:
+        combined_df = pd.concat(dfs, ignore_index=True)
+        print(combined_df.head())
+    else:
+        print("未找到 CSV 文件")
+
+
+if __name__ == '__main__':
+    compressed_file_path = r'D:\data\data.zip'
+    main()
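
For the common case of a zip containing GBK CSVs, pandas can read archive members directly from the open handles; a minimal sketch (archive path taken from the script, layout assumed flat):

    import zipfile
    import pandas as pd

    frames = []
    with zipfile.ZipFile(r'D:\data\data.zip') as z:
        for name in z.namelist():
            if name.endswith('.csv'):
                with z.open(name) as f:
                    frames.append(pd.read_csv(f, encoding='gbk'))
    combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print(combined.shape)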

+ 155 - 0
tmp_file/大唐玉湖-箱变.py

@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul  9 16:28:48 2024
+
+@author: Administrator
+"""
+
+from datetime import datetime
+from os import *
+
+import chardet
+import pandas as pd
+
+
+# 获取文件编码
+def detect_file_encoding(filename):
+    # 读取文件的前1000个字节(足够用于大多数编码检测)
+    with open(filename, 'rb') as f:
+        rawdata = f.read(1000)
+    result = chardet.detect(rawdata)
+    encoding = result['encoding']
+
+    if encoding is None:
+        encoding = 'gb18030'
+
+    if encoding and (encoding.lower() == 'gb2312' or encoding.lower().startswith("windows")):
+        encoding = 'gb18030'
+    return encoding
+
+
+# 读取数据到df
+def read_file_to_df(file_path, read_cols=list(), header=0):
+    df = pd.DataFrame()
+    if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
+        encoding = detect_file_encoding(file_path)
+        end_with_gz = str(file_path).lower().endswith("gz")
+        if read_cols:
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header, on_bad_lines='warn')
+        else:
+
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, header=header, on_bad_lines='warn')
+
+    else:
+        xls = pd.ExcelFile(file_path)
+        # 获取所有的sheet名称
+        sheet_names = xls.sheet_names
+        for sheet in sheet_names:
+            if read_cols:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header, usecols=read_cols)])
+            else:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header)])
+
+    return df
+
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # 遍历目录下的所有项(参数不能命名为 path,否则会遮蔽 from os import * 引入的 os.path)
+    for item in listdir(dir_path):
+        item_path = path.join(dir_path, item)
+        if path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif path.isfile(item_path):
+            if dir_path not in directory_dict:
+                directory_dict[dir_path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[dir_path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[dir_path].append(item_path)
+
+
+# 读取路径下所有的excel文件
+def read_excel_files(read_path):
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# 创建路径
+def create_file_path(file_path, is_file_path=False):
+    if is_file_path:
+        file_path = path.dirname(file_path)
+
+    if not path.exists(file_path):
+        makedirs(file_path, exist_ok=True)
+
+
+def read_and_save_csv(file_path):
+    begin = datetime.now()
+    base_name = path.basename(file_path)
+    print('开始', base_name)
+
+    df1 = read_file_to_df(file_path + "箱变(1-8号逆变器)数据1.xls")
+    del df1['Unnamed: 0']
+    df1['时间'] = pd.to_datetime(df1['时间'])
+    df1.set_index(keys='时间', inplace=True)
+
+    df2 = read_file_to_df(file_path + "箱变(9-16号逆变器)数据1.xls")
+    del df2['Unnamed: 0']
+    df2['时间'] = pd.to_datetime(df2['时间'])
+    df2.set_index(keys='时间', inplace=True)
+
+    df3 = read_file_to_df(file_path + "箱变(1-8号逆变器)数据2.xls")
+    del df3['Unnamed: 0']
+    df3['时间'] = pd.to_datetime(df3['时间'])
+    df3.set_index(keys='时间', inplace=True)
+
+    df4 = read_file_to_df(file_path + "箱变(9-16号逆变器)数据2.xls")
+    del df4['Unnamed: 0']
+    df4['时间'] = pd.to_datetime(df4['时间'])
+    df4.set_index(keys='时间', inplace=True)
+
+    df = pd.concat([df1, df2, df3, df4], axis=1)
+    df.reset_index(inplace=True)
+    columns = list(df.columns)
+    columns.sort()
+
+    print(df.columns)
+
+    df = df[columns]
+    df.sort_values(by='时间', inplace=True)
+
+    df.to_csv(path.join(r'D:\trans_data\大唐玉湖性能分析离线分析', '05整理数据', base_name + '_箱变.csv'), encoding='utf-8',
+              index=False)
+    print('结束', base_name, '耗时:' + str(datetime.now() - begin))
+
+
+if __name__ == '__main__':
+
+    read_path = r'D:\trans_data\大唐玉湖性能分析离线分析\test'
+    all_files = read_excel_files(read_path)
+
+    all_paths = set()
+    for file in all_files:
+        base_name = path.basename(file).split("箱变")[0]
+        base_path = path.dirname(file)
+        if base_name not in all_paths:
+            all_paths.add(path.join(base_path, base_name))
+
+    all_datas = list(all_paths)
+    all_datas.sort()
+
+    print(all_datas)
+    # with Pool(1) as pool:
+    #     pool.starmap(read_and_save_csv, [(i,) for i in all_datas])
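
The column-wise concat in read_and_save_csv relies on the four frames sharing a datetime index; in miniature (column names assumed):

    import pandas as pd

    idx = pd.to_datetime(['2024-01-01 00:00', '2024-01-01 00:05'])
    a = pd.DataFrame({'1号逆变器电压': [601.0, 603.0]}, index=idx)
    b = pd.DataFrame({'9号逆变器电压': [598.0, 600.0]}, index=idx)
    wide = pd.concat([a, b], axis=1)
    wide.index.name = '时间'
    print(wide.reset_index())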

+ 158 - 0
tmp_file/大唐玉湖数据整理.py

@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul  9 16:28:48 2024
+
+@author: Administrator
+"""
+import multiprocessing
+from datetime import datetime
+from os import *
+
+import chardet
+import pandas as pd
+
+pd.options.mode.copy_on_write = True
+
+
+# 获取文件编码
+def detect_file_encoding(filename):
+    # 读取文件的前1000个字节(足够用于大多数编码检测)
+    with open(filename, 'rb') as f:
+        rawdata = f.read(1000)
+    result = chardet.detect(rawdata)
+    encoding = result['encoding']
+
+    if encoding is None:
+        encoding = 'gb18030'
+
+    if encoding and (encoding.lower() == 'gb2312' or encoding.lower().startswith("windows")):
+        encoding = 'gb18030'
+    return encoding
+
+
+# 读取数据到df
+def read_file_to_df(file_path, read_cols=list(), header=0):
+    df = pd.DataFrame()
+    if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
+        encoding = detect_file_encoding(file_path)
+        end_with_gz = str(file_path).lower().endswith("gz")
+        if read_cols:
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header, on_bad_lines='warn')
+        else:
+
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, header=header, on_bad_lines='warn')
+
+    else:
+        xls = pd.ExcelFile(file_path)
+        # 获取所有的sheet名称
+        sheet_names = xls.sheet_names
+        for sheet in sheet_names:
+            if read_cols:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header, usecols=read_cols)])
+            else:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header)])
+
+    return df
+
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # 遍历目录下的所有项(参数不能命名为 path,否则会遮蔽 from os import * 引入的 os.path)
+    for item in listdir(dir_path):
+        item_path = path.join(dir_path, item)
+        if path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif path.isfile(item_path):
+            if dir_path not in directory_dict:
+                directory_dict[dir_path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[dir_path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[dir_path].append(item_path)
+
+
+# 读取路径下所有的excel文件
+def read_excel_files(read_path):
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# 创建路径
+def create_file_path(file_path, is_file_path=False):
+    if is_file_path:
+        file_path = path.dirname(file_path)
+
+    if not path.exists(file_path):
+        makedirs(file_path, exist_ok=True)
+
+
+def generate_df(pv_df, col):
+    if col != '时间':
+        xiangbian = col.split("逆变器")[0].replace("#", "")
+        nibianqi = col.split("-")[0].split('逆变器')[1]
+        pv_index = col.split("-")[1].replace("PV", "")
+        now_df = pv_df[['时间', col + '输入电流()', col + '输入电压()']]
+        now_df.loc[:, '箱变'] = xiangbian
+        now_df.loc[:, '逆变器'] = nibianqi
+        now_df.loc[:, 'PV'] = pv_index
+        now_df.columns = [df_col.replace(col, "").replace("()", "") for df_col in now_df.columns]
+        now_df['输入电流'] = now_df['输入电流'].astype(float)
+        now_df['输入电压'] = now_df['输入电压'].astype(float)
+
+        print(xiangbian, nibianqi, pv_index, now_df.shape)
+        return now_df
+    return pd.DataFrame()
+
+
+def read_and_save_csv(file_path, save_path):
+    begin = datetime.now()
+    base_name = path.basename(file_path)
+    print('开始', base_name)
+
+    df = read_file_to_df(file_path)
+    df['时间'] = pd.to_datetime(df['时间'])
+    # df.set_index(keys='时间', inplace=True)
+
+    pv_df_cols = [col for col in df.columns if col.find('输入电') > -1]
+    pv_df_cols.append('时间')
+    pv_df = df[pv_df_cols]
+    shuru_cols = set([col.split("输入电")[0] for col in pv_df.columns])
+
+    with multiprocessing.Pool(6) as pool:
+        dfs = pool.starmap(generate_df, [(pv_df, col) for col in shuru_cols])
+
+    saved_pv_df = pd.concat(dfs)
+    saved_pv_df.sort_values(by=['箱变', '逆变器', 'PV', '时间'], inplace=True)
+    save_file = path.join(save_path, path.basename(file_path).split(".")[0], 'PV.csv')
+    create_file_path(save_file, True)
+
+    saved_pv_df.to_csv(save_file, encoding='utf-8', index=False)
+
+    print('结束', base_name, '耗时:' + str(datetime.now() - begin))
+
+
+if __name__ == '__main__':
+    read_path = r'D:\trans_data\大唐玉湖性能分析离线分析\test\yuanshi'
+    save_path = r'D:\trans_data\大唐玉湖性能分析离线分析\test\zhengli'
+    all_files = read_excel_files(read_path)
+
+    all_datas = list(all_files)
+    all_datas.sort()
+    print(all_datas)
+
+    for file in all_datas:
+        read_and_save_csv(file, save_path)
+
+    # with Pool(1) as pool:
+    #     pool.starmap(read_and_save_csv, [(i, save_path) for i in all_datas])
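
generate_df above is a manual wide-to-long reshape per PV string; the same transform can be sketched with melt plus a regex split (hypothetical column names):

    import pandas as pd

    df = pd.DataFrame({
        '时间': pd.to_datetime(['2024-01-01 00:00', '2024-01-01 00:05']),
        '1#逆变器1-PV1输入电流()': [8.1, 8.3],
        '1#逆变器1-PV2输入电流()': [7.9, 8.0],
    })
    long = df.melt(id_vars='时间', var_name='通道', value_name='输入电流')
    long[['箱变', '逆变器', 'PV']] = long['通道'].str.extract(r'(\d+)#逆变器(\d+)-PV(\d+)')
    print(long[['时间', '箱变', '逆变器', 'PV', '输入电流']])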

+ 209 - 0
tmp_file/大唐玉湖数据整理_1.py

@@ -0,0 +1,209 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul  9 16:28:48 2024
+
+@author: Administrator
+"""
+import multiprocessing
+from datetime import datetime
+from os import *
+
+import chardet
+import pandas as pd
+
+pd.options.mode.copy_on_write = True
+
+
+# 获取文件编码
+def detect_file_encoding(filename):
+    # 读取文件的前1000个字节(足够用于大多数编码检测)
+    with open(filename, 'rb') as f:
+        rawdata = f.read(1000)
+    result = chardet.detect(rawdata)
+    encoding = result['encoding']
+
+    if encoding is None:
+        encoding = 'gb18030'
+
+    if encoding and (encoding.lower() == 'gb2312' or encoding.lower().startswith("windows")):
+        encoding = 'gb18030'
+    return encoding
+
+
+# 读取数据到df
+def read_file_to_df(file_path, read_cols=list(), header=0):
+    df = pd.DataFrame()
+    if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
+        encoding = detect_file_encoding(file_path)
+        end_with_gz = str(file_path).lower().endswith("gz")
+        if read_cols:
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header, on_bad_lines='warn')
+        else:
+
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, header=header, on_bad_lines='warn')
+
+    else:
+        xls = pd.ExcelFile(file_path)
+        # 获取所有的sheet名称
+        sheet_names = xls.sheet_names
+        for sheet in sheet_names:
+            if read_cols:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header, usecols=read_cols)])
+            else:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header)])
+
+    return df
+
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # 遍历目录下的所有项(参数不能命名为 path,否则会遮蔽 from os import * 引入的 os.path)
+    for item in listdir(dir_path):
+        item_path = path.join(dir_path, item)
+        if path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif path.isfile(item_path):
+            if dir_path not in directory_dict:
+                directory_dict[dir_path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[dir_path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[dir_path].append(item_path)
+
+
+# 读取路径下所有的excel文件
+def read_excel_files(read_path):
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# 创建路径
+def create_file_path(file_path, is_file_path=False):
+    if is_file_path:
+        file_path = path.dirname(file_path)
+
+    if not path.exists(file_path):
+        makedirs(file_path, exist_ok=True)
+
+
+def generate_df(pv_df, col):
+    if col != '时间':
+        xiangbian = col.split("逆变器")[0].replace("#", "")
+        nibianqi = col.split("-")[0].split('逆变器')[1]
+        pv_index = col.split("-")[1].replace("PV", "")
+        now_df = pv_df[['时间', col + '输入电流()', col + '输入电压()']]
+        now_df.loc[:, '箱变'] = xiangbian
+        now_df.loc[:, '逆变器'] = nibianqi
+        now_df.loc[:, 'PV'] = pv_index
+        now_df.columns = [df_col.replace(col, "").replace("()", "") for df_col in now_df.columns]
+        now_df['输入电流'] = now_df['输入电流'].astype(float)
+        now_df['输入电压'] = now_df['输入电压'].astype(float)
+
+        print(xiangbian, nibianqi, pv_index, now_df.shape)
+        return now_df
+    return pd.DataFrame()
+
+
+def split_index(split_data: str, split_str: str):
+    count = split_data.find(split_str)
+    if count > -1:
+        return split_data[count + len(split_str):]
+    else:
+        # 找不到分隔串时返回原串,而不是分隔串本身
+        return split_data
+
+
+def replece_col_to_biaozhun(col):
+    for k, v in dianjian_dict.items():
+        if col.find(k) > -1:
+            col = col.replace(k, v)
+            return col
+
+    return col
+
+
+def read_and_save_csv(file_path, save_path):
+    begin = datetime.now()
+    base_name = path.basename(file_path)
+    print('开始', base_name)
+
+    df = read_file_to_df(file_path)
+
+    for col in df.columns:
+        for del_col in del_cols:
+            if col.find(del_col) > -1:
+                del df[col]
+
+    df['时间'] = pd.to_datetime(df['时间'])
+    xiangbian = [col for col in df.columns if str(col).startswith('#') and str(col).find('逆变器') > -1][0].split("逆变器")[
+        0].replace("#", "")
+    df.columns = [xiangbian + "_" + split_index(df_col, "逆变器").replace('PV', "").replace("()", "").replace("-",
+                                                                                                           "_") if df_col.startswith(
+        "#") else df_col for df_col in
+                  df.columns]
+
+    df.columns = [col.replace("输入", "_输入") for col in df.columns]
+
+    df.columns = [replece_col_to_biaozhun(col) for col in df.columns]
+
+    # saved_pv_df = pd.concat(dfs)
+    df.sort_values(by=['时间'], inplace=True)
+    save_file = path.join(save_path, path.basename(file_path))
+    create_file_path(save_file, True)
+
+    df.to_csv(save_file, encoding='utf-8', index=False)
+
+    print('结束', base_name, '耗时:' + str(datetime.now() - begin))
+
+
+dianjian_data_str = """
+输入电压	支路输出电压
+输入电流	支路输出电流
+功率因数	
+总发电量	逆变器总发电量
+无功功率	
+有功功率	逆变器输出有功功率
+机内温度	逆变器温度
+电网AB线电压	交流输出电压
+电网A相电流	逆变器输出电流A相
+电网BC线电压	
+电网B相电流	逆变器输出电流B相
+电网CA线电压	
+电网C相电流	逆变器输出电流C相
+逆变器效率	逆变器转换效率
+"""
+
+dianjian_dict = {}
+del_cols = []
+for data in dianjian_data_str.split("\n"):
+    if data:
+        datas = data.split("\t")
+        if len(datas) == 2 and datas[1]:
+            dianjian_dict[datas[0]] = datas[1]
+        else:
+            del_cols.append(datas[0])
+
+if __name__ == '__main__':
+    read_path = r'/data/download/大唐玉湖性能分析离线分析/05整理数据/逆变器数据'
+    save_path = r'/data/download/大唐玉湖性能分析离线分析/06整理数据/逆变器数据'
+    all_files = read_excel_files(read_path)
+
+    all_datas = list(all_files)
+    all_datas.sort()
+    print(all_datas)
+    #
+    # for file in all_datas:
+    #     read_and_save_csv(file, save_path)
+
+    with multiprocessing.Pool(20) as pool:
+        pool.starmap(read_and_save_csv, [(i, save_path) for i in all_datas])
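
The dianjian_data_str block above is a tab-separated rename spec: rows with a second column go into the rename dict, rows without one go into the delete list. In miniature:

    spec = "输入电压\t支路输出电压\n功率因数\t\n有功功率\t逆变器输出有功功率\n"
    rename, drop = {}, []
    for line in spec.splitlines():
        if not line:
            continue
        parts = line.split("\t")
        if len(parts) == 2 and parts[1]:
            rename[parts[0]] = parts[1]
        else:
            drop.append(parts[0])
    print(rename, drop)
    # {'输入电压': '支路输出电压', '有功功率': '逆变器输出有功功率'} ['功率因数']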

+ 283 - 0
tmp_file/大唐玉湖数据整理_2.py

@@ -0,0 +1,283 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul  9 16:28:48 2024
+
+@author: Administrator
+"""
+import multiprocessing
+from datetime import datetime
+from os import listdir, makedirs, path
+
+import chardet
+import numpy as np
+import pandas as pd
+
+pd.options.mode.copy_on_write = True
+
+
+# detect the file encoding from a short sample of raw bytes
+def detect_file_encoding(filename):
+    # the first 1000 bytes are enough for most encoding detection
+    with open(filename, 'rb') as f:
+        rawdata = f.read(1000)
+    result = chardet.detect(rawdata)
+    encoding = result['encoding']
+
+    if encoding is None:
+        encoding = 'gb18030'
+
+    # gb18030 is a superset of gb2312 and of the Windows code pages chardet tends to report
+    if encoding.lower() == 'gb2312' or encoding.lower().startswith("windows"):
+        encoding = 'gb18030'
+    return encoding
+
+
+# read a csv/gz/excel file into a DataFrame
+def read_file_to_df(file_path, read_cols=None, header=0):
+    df = pd.DataFrame()
+    if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
+        encoding = detect_file_encoding(file_path)
+        end_with_gz = str(file_path).lower().endswith("gz")
+        if read_cols:
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header, on_bad_lines='warn')
+        else:
+
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, header=header, on_bad_lines='warn')
+
+    else:
+        xls = pd.ExcelFile(file_path)
+        # read every sheet and concatenate them
+        sheet_names = xls.sheet_names
+        for sheet in sheet_names:
+            if read_cols:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header, usecols=read_cols)])
+            else:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header)])
+
+    return df
+
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # walk every entry under dir_path, grouping files by their parent directory;
+    # the parameter must not be called 'path', which would shadow os.path
+    for item in listdir(dir_path):
+        item_path = path.join(dir_path, item)
+        if path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif path.isfile(item_path):
+            if dir_path not in directory_dict:
+                directory_dict[dir_path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[dir_path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[dir_path].append(item_path)
+
+
+# collect every excel/csv/gz file under read_path
+def read_excel_files(read_path):
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [file for files in directory_dict.values() for file in files if file]
+
+
+# create the directory (or a file path's parent directory) if it is missing
+def create_file_path(target_path, is_file_path=False):
+    if is_file_path:
+        target_path = path.dirname(target_path)
+
+    if not path.exists(target_path):
+        makedirs(target_path, exist_ok=True)
+
+
+def split_index(split_data: str, split_str: str):
+    count = split_data.find(split_str)
+    if count > -1:
+        return split_data[count + len(split_str):]
+    else:
+        # separator not found: return the input unchanged
+        return split_data
+
+
+def replace_col_to_biaozhun(col):
+    # replace the first matching raw keyword with its standardized name
+    for k, v in dianjian_dict.items():
+        if col.find(k) > -1:
+            col = col.replace(k, v)
+            return col
+
+    return col
+
+
+def row_to_datas(row, pv_dict, inverter_cols, df_cols):
+    # each output row has 14 slots: 时间, 箱变, 逆变器, 支路, 2 branch columns, 8 inverter columns
+    row_datas = []
+    for xiangbian in pv_dict.keys():
+        for nibianqi in pv_dict[xiangbian].keys():
+            for pv in pv_dict[xiangbian][nibianqi]:
+                datas = [np.nan] * 14
+                datas[0] = row['时间']
+                datas[1] = xiangbian
+                datas[2] = nibianqi
+                datas[3] = pv
+                datas_4_col = "_".join([str(xiangbian), str(nibianqi), str(pv), '支路输出电压'])
+                if datas_4_col in df_cols:
+                    datas[4] = row[datas_4_col]
+                else:
+                    datas[4] = np.nan
+
+                datas_5_col = "_".join([str(xiangbian), str(nibianqi), str(pv), '支路输出电流'])
+                if datas_5_col in df_cols:
+                    datas[5] = row[datas_5_col]
+                else:
+                    datas[5] = np.nan
+
+                row_datas.append(datas)
+
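+    # inverter-level rows: 支路 is set to 0 and slots 6..13 carry the inverter metrics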
+    for xiangbian in pv_dict.keys():
+        for nibianqi in pv_dict[xiangbian].keys():
+            datas = [np.nan] * 14
+            datas[0] = row['时间']
+            datas[1] = xiangbian
+            datas[2] = nibianqi
+            datas[3] = 0
+            for index, col_name in enumerate(inverter_cols):
+                col = '_'.join([str(xiangbian), str(nibianqi), col_name])
+                if col in df_cols:
+                    datas[index + 6] = row[col]
+                else:
+                    datas[index + 6] = np.nan
+
+            row_datas.append(datas)
+
+    return row_datas
+
+
+def df_to_biaozhun(df):
+    pv_cols = ['支路输出电压', '支路输出电流']
+    inverter_cols = ['逆变器总发电量', '逆变器输出有功功率', '逆变器温度', '交流输出电压', '逆变器输出电流A相', '逆变器输出电流B相', '逆变器输出电流C相', '逆变器转换效率']
+    # build the 箱变 -> 逆变器 -> {PV branches} dictionary from the column names
+    pv_dict = {}
+    for col in df.columns:
+        for pv_col in pv_cols:
+            if str(col).endswith(pv_col):
+                datas = col.split("_")
+                xiangbian = datas[0]
+                nibiangqi = datas[1]
+                pv = datas[2]
+
+                if xiangbian in pv_dict.keys():
+                    if nibiangqi in pv_dict[xiangbian]:
+                        pv_dict[xiangbian][nibiangqi].add(pv)
+                    else:
+                        pv_dict[xiangbian][nibiangqi] = set([pv])
+                else:
+                    pv_dict[xiangbian] = {nibiangqi: set([pv])}
+
+    results = df.apply(row_to_datas, args=(pv_dict, inverter_cols, df.columns), axis=1)
+
+    df_datas = results.to_list()
+    df_datas = [da for data in df_datas for da in data]
+    df_cols = ["时间", "箱变", "逆变器", "支路"]
+    df_cols.extend(pv_cols)
+    df_cols.extend(inverter_cols)
+    df = pd.DataFrame(df_datas, columns=df_cols)
+
+    type_conver_list = []
+    type_conver_list.extend(pv_cols)
+    type_conver_list.extend(inverter_cols)
+    for type_conver in type_conver_list:
+        df[type_conver] = pd.to_numeric(df[type_conver], errors='coerce')
+
+    return df
+
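+# df_to_biaozhun reshapes the wide layout into a long one: a column such as
+# '3_5_2_支路输出电流' (values illustrative) becomes a row with 箱变=3, 逆变器=5,
+# 支路=2, while inverter-level metrics become extra rows with 支路 = 0.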
+
+def read_and_save_csv(file_path, save_path):
+    begin = datetime.now()
+    base_name = path.basename(file_path)
+    print('开始', base_name)
+
+    df = read_file_to_df(file_path)
+
+    # drop every column whose name contains one of the deletion keywords
+    for col in df.columns:
+        for del_col in del_cols:
+            if col.find(del_col) > -1:
+                del df[col]
+                break  # the column is gone; stop checking the other keywords
+
+    df['时间'] = pd.to_datetime(df['时间'])
+    # the transformer-box number, e.g. '#3逆变器...' -> '3'
+    xiangbian = [col for col in df.columns
+                 if str(col).startswith('#') and str(col).find('逆变器') > -1][0].split("逆变器")[0].replace("#", "")
+    df.columns = [
+        xiangbian + "_" + split_index(df_col, "逆变器").replace('PV', "").replace("()", "").replace("-", "_")
+        if df_col.startswith("#") else df_col
+        for df_col in df.columns
+    ]
+
+    df.columns = [col.replace("输入", "_输入") for col in df.columns]
+    df.columns = [replace_col_to_biaozhun(col) for col in df.columns]
+
+    df = df_to_biaozhun(df)
+
+    # df.sort_values(by=['时间', "箱变", "逆变器", "支路"], inplace=True)
+    # save_file = path.join(save_path, path.basename(file_path))
+    # create_file_path(save_file, True)
+
+    # df.to_csv(save_file, encoding='utf-8', index=False)
+
+    print('结束', base_name, '耗时:' + str(datetime.now() - begin))
+    return df
+
+dianjian_data_str = """
+输入电压	支路输出电压
+输入电流	支路输出电流
+功率因数	
+总发电量	逆变器总发电量
+无功功率	
+有功功率	逆变器输出有功功率
+机内温度	逆变器温度
+电网AB线电压	交流输出电压
+电网A相电流	逆变器输出电流A相
+电网BC线电压	
+电网B相电流	逆变器输出电流B相
+电网CA线电压	
+电网C相电流	逆变器输出电流C相
+逆变器效率	逆变器转换效率
+"""
+
+dianjian_dict = {}
+del_cols = []
+for data in dianjian_data_str.split("\n"):
+    if data:
+        datas = data.split("\t")
+        if len(datas) == 2 and datas[1]:
+            dianjian_dict[datas[0]] = datas[1]
+        else:
+            del_cols.append(datas[0])
+
+if __name__ == '__main__':
+    # not named 'path': that would shadow os.path used by the helpers
+    read_path = r'/data/download/大唐玉湖性能分析离线分析/05整理数据/逆变器数据'
+    save_path = r'/data/download/大唐玉湖性能分析离线分析/06整理数据/逆变器数据'
+    # read_path = r'D:\trans_data\大唐玉湖性能分析离线分析\test\yuanshi'
+    # save_path = r'D:\trans_data\大唐玉湖性能分析离线分析\test\zhengli'
+    all_files = read_excel_files(read_path)
+
+    all_datas = list(all_files)
+    all_datas.sort()
+    print(all_datas)
+
+    # serial fallback for debugging:
+    # for file in all_datas:
+    #     read_and_save_csv(file, save_path)
+
+    with multiprocessing.Pool(40) as pool:
+        dfs = pool.starmap(read_and_save_csv, [(i, save_path) for i in all_datas])
+
+    saved_pv_df = pd.concat(dfs)
+    saved_pv_df.sort_values(by=['时间', "箱变", "逆变器", "支路"], inplace=True)
+    save_file = path.join(save_path, "合并.csv")
+    create_file_path(save_file, True)
+    saved_pv_df.to_csv(save_file, encoding='utf-8', index=False)

+ 122 - 0
tmp_file/大唐玉湖气象合并.py

@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul  9 16:28:48 2024
+
+@author: Administrator
+"""
+from os import listdir, makedirs, path
+
+import chardet
+import pandas as pd
+
+pd.options.mode.copy_on_write = True
+
+
+# detect the file encoding from a short sample of raw bytes
+def detect_file_encoding(filename):
+    # the first 1000 bytes are enough for most encoding detection
+    with open(filename, 'rb') as f:
+        rawdata = f.read(1000)
+    result = chardet.detect(rawdata)
+    encoding = result['encoding']
+
+    if encoding is None:
+        encoding = 'gb18030'
+
+    # gb18030 is a superset of gb2312 and of the Windows code pages chardet tends to report
+    if encoding.lower() == 'gb2312' or encoding.lower().startswith("windows"):
+        encoding = 'gb18030'
+    return encoding
+
+
+# read a csv/gz/excel file into a DataFrame
+def read_file_to_df(file_path, read_cols=None, header=0):
+    df = pd.DataFrame()
+    if str(file_path).lower().endswith("csv") or str(file_path).lower().endswith("gz"):
+        encoding = detect_file_encoding(file_path)
+        end_with_gz = str(file_path).lower().endswith("gz")
+        if read_cols:
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, usecols=read_cols, header=header, on_bad_lines='warn')
+        else:
+
+            if end_with_gz:
+                df = pd.read_csv(file_path, encoding=encoding, compression='gzip', header=header)
+            else:
+                df = pd.read_csv(file_path, encoding=encoding, header=header, on_bad_lines='warn')
+
+    else:
+        xls = pd.ExcelFile(file_path)
+        # read every sheet and concatenate them
+        sheet_names = xls.sheet_names
+        for sheet in sheet_names:
+            if read_cols:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header, usecols=read_cols)])
+            else:
+                df = pd.concat([df, pd.read_excel(xls, sheet_name=sheet, header=header)])
+
+    return df
+
+
+def __build_directory_dict(directory_dict, dir_path, filter_types=None):
+    # walk every entry under dir_path, grouping files by their parent directory;
+    # the parameter must not be called 'path', which would shadow os.path
+    for item in listdir(dir_path):
+        item_path = path.join(dir_path, item)
+        if path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif path.isfile(item_path):
+            if dir_path not in directory_dict:
+                directory_dict[dir_path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[dir_path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[dir_path].append(item_path)
+
+
+# collect every excel/csv/gz file under read_path
+def read_excel_files(read_path):
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])
+
+    return [file for files in directory_dict.values() for file in files if file]
+
+
+# create the directory (or a file path's parent directory) if it is missing
+def create_file_path(target_path, is_file_path=False):
+    if is_file_path:
+        target_path = path.dirname(target_path)
+
+    if not path.exists(target_path):
+        makedirs(target_path, exist_ok=True)
+
+
+if __name__ == '__main__':
+    # read_path = r'/data/download/大唐玉湖性能分析离线分析/05整理数据/气象站数据'
+    # save_path = r'/data/download/大唐玉湖性能分析离线分析/06整理数据/气象站数据'
+    # not named 'path', which would shadow os.path used below
+    read_path = r'Z:\大唐玉湖性能分析离线分析\05整理数据\气象站数据'
+    save_path = r'Z:\大唐玉湖性能分析离线分析\06整理数据\气象站数据'
+
+    fengsu_df = read_file_to_df(path.join(read_path, '风速.csv'), read_cols=['当前时间', '实际风速'])
+    fengxiang_df = read_file_to_df(path.join(read_path, '风向.csv'), read_cols=['当前时间', '实际风向'])
+    fuzhaodu_df = read_file_to_df(path.join(read_path, '辐照度.csv'), read_cols=['时间', '水平总辐照度', '倾斜总辐照度', '散射辐照度'])
+    shidu_df = read_file_to_df(path.join(read_path, '湿度.csv'), read_cols=['时间', '实际湿度'])
+    wendu_df = read_file_to_df(path.join(read_path, '温度.csv'), read_cols=['时间', '实际温度'])
+    yali_df = read_file_to_df(path.join(read_path, '压力.csv'), read_cols=['时间', '实际气压'])
+
+    fengsu_df.rename(columns={'当前时间': '时间'}, inplace=True)
+    fengxiang_df.rename(columns={'当前时间': '时间'}, inplace=True)
+
+    dfs = [fengxiang_df, fengsu_df, fuzhaodu_df, shidu_df, wendu_df, yali_df]
+
+    # align all six tables on the 时间 index and join them column-wise
+    for df in dfs:
+        df['时间'] = pd.to_datetime(df['时间'])
+        df.set_index(keys='时间', inplace=True)
+
+    df = pd.concat(dfs, axis=1)
+    create_file_path(save_path, is_file_path=False)
+    df.to_csv(path.join(save_path, '气象合并.csv'), encoding='utf-8')

+ 96 - 0
tmp_file/年度平均缺失率.py

@@ -0,0 +1,96 @@
+import calendar
+import datetime
+import multiprocessing
+
+import pandas as pd
+
+from utils.file.trans_methods import read_excel_files, read_file_to_df
+
+
+def get_year_days(year):
+    now_year = datetime.datetime.now().year
+
+    if now_year == year:
+        # for the current year, count only the days elapsed up to yesterday
+        today = datetime.date.today()
+        yesterday = today - datetime.timedelta(days=1)
+        start_of_year = datetime.date(yesterday.year, 1, 1)
+        return (yesterday - start_of_year).days + 1
+
+    if calendar.isleap(year):
+        return 366
+    else:
+        return 365
+
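+# e.g. get_year_days(2024) -> 366 (leap year); for the current year the count runs
+# only through yesterday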
+
+def save_percent(value, save_decimal=7):
+    # convert a ratio to a percentage, rounding after the multiplication, e.g. 0.1234567 -> 12.34567
+    return round(value * 100, save_decimal)
+
+
+if __name__ == '__main__':
+
+    read_dir = r'D:\data\综合报表22-24年'
+
+    all_files = read_excel_files(read_dir)
+
+    with multiprocessing.Pool(6) as pool:
+        dfs = pool.map(read_file_to_df, all_files)
+
+    df = pd.concat(dfs, ignore_index=True)
+    del_cols = ['Unnamed: 0', '序号', 'times']
+
+    for col in del_cols:
+        if col in df.columns:
+            del df[col]
+
+    df = df.query("风机 != '完整'")
+
+    numic_cols = ['数据有效性', '历史总有功发电量', '历史总有功耗电量',
+                  '查询区间有功发电量', '查询区间有功耗电量', '历史总无功发电量', '历史总无功耗电量',
+                  '查询区间无功发电量',
+                  '查询区间无功耗电量', '时间可利用率', '最大风速', '最小风速', '平均风速', '空气密度', '最大有功功率',
+                  '最小有功功率', '平均有功功率', '平均无功功率', '电网停机次数', '累计运行时间', '有效风时数',
+                  '满发时间',
+                  '启动时间', '启动次数', '并网发电时间', '等效发电时间', '正常发电时间', '调度限功率发电时间',
+                  '风机限功率发电时间',
+                  '停机时间', '维护停机时间', '故障停机时间', '调度停机时间', '气象停机时间', '电网停机时间',
+                  '远程停机时间',
+                  '待机时间', '户外平均温度', '机舱最高温度', '维护停机次数', '气象停机次数', '故障停机次数',
+                  '报警发电时间',
+                  '报警发电次数', '偏航时长', '偏航次数', '通讯中断时间', '通讯故障次数', '调度限功率发电损失电量',
+                  '风机限功率发电损失电量', '气象停机损失电量', '调度限功率停机损失电量', '远程停机损失电量',
+                  '维护停机损失电量',
+                  '风机故障停机损失电量', '电网停机损失电量']
+
+    for numic_col in numic_cols:
+        df[numic_col] = pd.to_numeric(df[numic_col], errors='coerce')
+
+    cols = df.columns
+    df['year'] = pd.to_datetime(df['时间'], errors='coerce').dt.year
+
+    group_df = df.groupby(by=['year', '风机']).count()
+    group_df.reset_index(inplace=True)
+    count_df = pd.DataFrame(group_df)
+
+    # now_df.to_csv('聚合后.csv', encoding='utf-8', index=False)
+
+    years = count_df['year'].unique()
+    wind_names = count_df['风机'].unique()
+    numic_cols.insert(0, '时间')
+
+    result_df = pd.DataFrame()
+    for year in years:
+        year_days = get_year_days(year)
+        for wind_name in wind_names:
+            count = count_df[(count_df['year'] == year) & (count_df['风机'] == wind_name)][numic_cols].values[0].sum()
+            print(year, wind_name, count, len(numic_cols) * year_days)
+            now_df = pd.DataFrame()
+            now_df['时间'] = [int(year)]
+            now_df['风机'] = [wind_name]
+            # missing rate = 1 - (non-null count / expected count), matching the script's name
+            now_df['缺失均值'] = [save_percent(1 - count / (len(numic_cols) * year_days))]
+
+            result_df = pd.concat([result_df, now_df])
+
+    result_df.to_csv('年度平均缺失率.csv', encoding='utf-8', index=False)

+ 106 - 0
tmp_file/张崾先振动.py

@@ -0,0 +1,106 @@
+import datetime
+import multiprocessing
+import os
+from concurrent.futures.thread import ThreadPoolExecutor
+
+import pandas as pd
+
+
+def __build_directory_dict(directory_dict, path, filter_types=None):
+    # walk every entry under the directory
+    for item in os.listdir(path):
+        item_path = os.path.join(path, item)
+        if os.path.isdir(item_path):
+            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
+        elif os.path.isfile(item_path):
+            if path not in directory_dict:
+                directory_dict[path] = []
+
+            if filter_types is None or len(filter_types) == 0:
+                directory_dict[path].append(item_path)
+            elif str(item_path).split(".")[-1] in filter_types:
+                if str(item_path).count("~$") == 0:
+                    directory_dict[path].append(item_path)
+
+
+# collect every excel/csv file under read_path
+def read_excel_files(read_path, filter_types=None):
+    if filter_types is None:
+        filter_types = ['xls', 'xlsx', 'csv', 'gz']
+    if os.path.isfile(read_path):
+        return [read_path]
+
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=filter_types)
+
+    return [path for paths in directory_dict.values() for path in paths if path]
+
+
+# collect every data/archive file under read_path
+def read_files(read_path, filter_types=None):
+    if filter_types is None:
+        filter_types = ['xls', 'xlsx', 'csv', 'gz', 'zip', 'rar']
+    if os.path.isfile(read_path):
+        return [read_path]
+    directory_dict = {}
+    __build_directory_dict(directory_dict, read_path, filter_types=filter_types)
+
+    return [path1 for paths in directory_dict.values() for path1 in paths if path1]
+
+
+def get_line_count(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        return sum(1 for _ in file)
+
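+# note: get_line_count assumes utf-8 txt files; other encodings would raise
+# UnicodeDecodeError here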
+
+def read_file_and_read_count_exec(file_path):
+    base_name = os.path.basename(file_path).split('.')[0]
+    cols = base_name.split('_')
+    cols.append(get_line_count(file_path))
+    return cols
+
+
+def read_file_and_read_count(index, file_paths, datas):
+    pretty_print(f'开始执行:{index + 1}')
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        colses = list(executor.map(read_file_and_read_count_exec, file_paths))
+
+    datas.extend(colses)
+    pretty_print(f'结束执行:{index + 1},数据长度:{len(datas)}')
+
+
+def get_name(x):
+    result_str = ''
+    if x['col3'] != '无':
+        result_str += x['col3']
+    result_str += x['col2']
+    if x['col4'] != '无':
+        result_str += x['col4']
+    result_str += x['col6']
+    return result_str
+
+
+def split_array(array, num):
+    return [array[i:i + num] for i in range(0, len(array), num)]
+
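+# example: split_array([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]]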
+
+def pretty_print(*args):
+    print(datetime.datetime.now(), ",".join([str(arg) for arg in args]))
+
+
+if __name__ == '__main__':
+    datas = multiprocessing.Manager().list()
+    all_files = read_files(r'D:\cms数据\张崾先风电场2期-导出\CMSFTPServer\ZYXFDC2', ['txt'])
+    # all_files = read_files(r'D:\cms数据\测试\result\CMSFTPServer\ZYXFDC2', ['txt'])
+    pretty_print(f"文件长度{len(all_files)}")
+    arrays = split_array(all_files, 5000)
+    pretty_print(f"切分个数{len(arrays)}")
+    with multiprocessing.Pool(10) as pool:
+        pool.starmap(read_file_and_read_count, [(index, file_paths, datas) for index, file_paths in enumerate(arrays)])
+
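+    # each row holds the '_'-separated file-name fields plus the appended line
+    # count; 10 columns assumes the names always split into nine fields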
+    df = pd.DataFrame(data=list(datas), columns=[f'col{i}' for i in range(10)])
+
+    df['col8'] = pd.to_datetime(df['col8'], format='%Y%m%d%H%M%S', errors='coerce')
+    df.sort_values(by=['col1', 'col8'], inplace=True)
+    df['测点完整名称'] = df.apply(get_name, axis=1)
+    df.to_csv('d://cms数据//cms_data.csv', index=False, encoding='utf8')

+ 32 - 0
tmp_file/张崾先故障.py

@@ -0,0 +1,32 @@
+import multiprocessing
+import os
+
+import pandas as pd
+
+read_dir = r'D:\data\张崾先风电场\故障事件数据'
+save_dir = r'D:\data\张崾先风电场\故障事件数据整理'
+
+print(os.listdir(read_dir))
+
+
+def read_solve_data(file_dir):
+    base_dir = os.path.basename(file_dir)
+    df = pd.DataFrame()
+    for file in os.listdir(file_dir):
+        df = pd.concat([df, pd.read_csv(file_dir + '/' + file, encoding='gbk')])
+
+    df['开始时间'] = pd.to_datetime(df['开始时间'], errors='coerce')
+    df = df.query("(开始时间 >= '2024-01-01 00:00:00') & (开始时间 < '2024-12-01 00:00:00')")
+    df['month'] = df['开始时间'].dt.month
+    months = df['month'].unique()
+    for month in months:
+        df_month = df[df['month'] == month]
+        os.makedirs(save_dir + os.sep + base_dir, exist_ok=True)
+        df_month.to_csv(save_dir + os.sep + base_dir + os.sep + str(month) + '.csv', index=False)
+
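+# read_solve_data merges one turbine's fault csvs, keeps events that started between
+# 2024-01-01 and 2024-12-01, and writes one csv per month under save_dir/<turbine>/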
+
+if __name__ == '__main__':
+    dirs = os.listdir(read_dir)
+
+    with multiprocessing.Pool(4) as pool:
+        pool.map(read_solve_data, [read_dir + os.sep + i for i in dirs])

+ 67 - 0
tmp_file/张崾先统计缺失率-分.py

@@ -0,0 +1,67 @@
+import multiprocessing
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(__file__).split("tmp_file")[0])
+
+import pandas as pd
+
+from utils.file.trans_methods import read_file_to_df
+
+
+def save_percent(value, save_decimal=7):
+    # convert a ratio to a percentage, rounding after the multiplication
+    return round(value * 100, save_decimal)
+
+
+def read_and_select(file_path, read_cols):
+    result_df = pd.DataFrame()
+    df = read_file_to_df(file_path, read_cols=read_cols)
+    wind_name = os.path.basename(file_path).split('.')[0]
+    df['风机号'] = wind_name
+    df = df.query("(startTime>='2023-10-01 00:00:00') & (startTime<'2024-10-01 00:00:00')")
+    count = 366 * 24 * 6  # 10-minute records: 366 days from 2023-10-01 to 2024-10-01 (includes 2024-02-29)
+    repeat_time_count = df.shape[0] - len(df['startTime'].unique())
+    print(wind_name, count, repeat_time_count)
+    result_df['风机号'] = [wind_name]
+    result_df['重复率'] = [save_percent(repeat_time_count / count)]
+    result_df['重复次数'] = [repeat_time_count]
+    result_df['总记录数'] = [count]
+
+    for read_col in read_cols:
+
+        if read_col != 'startTime':
+            df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
+        else:
+            df[read_col] = pd.to_datetime(df[read_col], errors='coerce')
+
+    group_df = df.groupby(by=['风机号']).count()
+    group_df.reset_index(inplace=True)
+    count_df = pd.DataFrame(group_df)
+    total_count = count_df[read_cols].values[0].sum()
+    print(wind_name, total_count, count * len(read_cols))
+    result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / (count * len(read_cols)))]
+    result_df['缺失数值'] = ['-'.join([str(count - i) for i in count_df[read_cols].values[0]])]
+    del group_df
+
+    error_fengsu_count = df.query("(风速10min < 0) | (风速10min > 80)").shape[0]
+    error_yougong_gonglv = df.query("(有功功率 < -200) | (有功功率 > 4800)").shape[0]
+
+    result_df['平均异常率'] = [save_percent((error_fengsu_count + error_yougong_gonglv) / (2 * count))]
+
+    return result_df
+
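+# each per-turbine row reports repetition, missing and anomaly percentages; 缺失数值
+# joins the per-column missing counts with '-', in read_cols order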
+
+if __name__ == '__main__':
+    read_cols_str = 'startTime,有功功率,叶轮转速,发电机转速,风速10min,桨叶1角度,桨叶2角度,桨叶3角度,机舱位置,偏航误差,发电机轴承温度,机舱内温度,环境温度,发电机U相温度,发电机V相温度,发电机W相温度'
+    read_cols = [i for i in read_cols_str.split(",") if i]
+    read_dir = r'/data/download/collection_data/1进行中/张崾先风电场-陕西-华电/收资数据/导出数据2'
+
+    files = os.listdir(read_dir)
+
+    with multiprocessing.Pool(16) as pool:
+        dfs = pool.starmap(read_and_select, [(os.path.join(read_dir, i), read_cols) for i in files])
+
+    df = pd.concat(dfs, ignore_index=True)
+    df.sort_values(by=['风机号'], inplace=True)
+
+    df.to_csv("张崾先统计-分钟.csv", encoding='utf8', index=False)

+ 92 - 0
tmp_file/张崾先统计缺失率.py

@@ -0,0 +1,92 @@
+import multiprocessing
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(__file__).split("tmp_file")[0])
+
+import pandas as pd
+
+from utils.file.trans_methods import read_file_to_df
+
+
+def save_percent(value, save_decimal=7):
+    # convert a ratio to a percentage, rounding after the multiplication
+    return round(value * 100, save_decimal)
+
+
+def read_and_select(file_path, read_cols):
+    result_df = pd.DataFrame()
+    df = read_file_to_df(file_path, read_cols=read_cols)
+    wind_name = os.path.basename(file_path).split('.')[0]
+    df['风机号'] = wind_name
+    df = df.query("(Time>='2024-06-01 00:00:00') & (Time<'2024-12-01 00:00:00')")
+    count = 15811200  # 1-second records: 183 days (2024-06-01 .. 2024-12-01) * 86400
+    repeat_time_count = df.shape[0] - len(df['Time'].unique())
+    print(wind_name, count, repeat_time_count)
+    result_df['风机号'] = [wind_name]
+    result_df['重复率'] = [save_percent(repeat_time_count / count)]
+    result_df['重复次数'] = [repeat_time_count]
+    result_df['总记录数'] = [count]
+
+    for read_col in read_cols:
+
+        if read_col != 'Time':
+            df[read_col] = pd.to_numeric(df[read_col], errors='coerce')
+        else:
+            df[read_col] = pd.to_datetime(df[read_col], errors='coerce')
+
+    group_df = df.groupby(by=['风机号']).count()
+    group_df.reset_index(inplace=True)
+    count_df = pd.DataFrame(group_df)
+    total_count = count_df[read_cols].values[0].sum()
+    print(wind_name, total_count, count * len(read_cols))
+    result_df['平均缺失率,单位%'] = [save_percent(1 - total_count / (count * len(read_cols)))]
+    result_df['缺失数值'] = ['-'.join([str(count - i) for i in count_df[read_cols].values[0]])]
+    del group_df
+
+    fengsu_count = 0
+    fengsu_cols = [i for i in read_cols if '风速' in i]
+    fengsu_str = ''
+    for col in fengsu_cols:
+        now_count = df.query("(" + col + " < 0) | (" + col + " > 80)").shape[0]
+        fengsu_count = fengsu_count + now_count
+        fengsu_str = fengsu_str + ',' + col + ':' + str(now_count)  # per-column count
+    result_df['风速异常'] = [fengsu_str]
+
+    gonglv_cols = ['有功功率', '瞬时功率', '当前理论可发最大功率']
+    gonglv_count = 0
+    gonglv_str = ''
+    for col in gonglv_cols:
+        now_count = df.query("(" + col + " < -200) | (" + col + " > 3000)").shape[0]
+        gonglv_count = gonglv_count + now_count
+        gonglv_str = gonglv_str + ',' + col + ':' + str(now_count)  # per-column count
+    result_df['功率异常'] = [gonglv_str]
+
+    result_df['平均异常率'] = [
+        save_percent((fengsu_count + gonglv_count) / ((len(fengsu_cols) + len(gonglv_cols)) * count))]
+
+    return result_df
+
+
+if __name__ == '__main__':
+    read_cols = ['Time', '设备主要状态', '功率曲线风速', '湍流强度', '实际风速', '有功功率', '桨叶角度A', '桨叶角度B',
+                 '桨叶角度C', '机舱内温度', '机舱外温度', '绝对风向', '机舱绝对位置', '叶轮转速', '发电机转速',
+                 '瞬时风速',
+                 '有功设定反馈', '当前理论可发最大功率', '空气密度', '偏航误差', '发电机扭矩', '瞬时功率', '风向1s',
+                 '偏航压力', '桨叶1速度', '桨叶2速度', '桨叶3速度', '桨叶1角度给定', '桨叶2角度给定', '桨叶3角度给定',
+                 '轴1电机电流', '轴2电机电流', '轴3电机电流', '轴1电机温度', '轴2电机温度', '轴3电机温度', '待机',
+                 '启动',
+                 '偏航', '并网', '限功率', '正常发电', '故障', '计入功率曲线', '运行发电机冷却风扇1',
+                 '运行发电机冷却风扇2',
+                 '激活偏航解缆阀', '激活偏航刹车阀', '激活风轮刹车阀', '激活顺时针偏航', '激活逆时针偏航', '电缆扭角']
+
+    read_dir = r'/data/download/collection_data/1进行中/张崾先风电场-陕西-华电/清理数据/点检表以外测点儿-20241210'
+
+    files = os.listdir(read_dir)
+
+    with multiprocessing.Pool(4) as pool:
+        dfs = pool.starmap(read_and_select, [(os.path.join(read_dir, i), read_cols) for i in files])
+
+    df = pd.concat(dfs, ignore_index=True)
+    df.sort_values(by=['风机号'], inplace=True)
+
+    df.to_csv("张崾先统计-秒.csv", encoding='utf8', index=False)

+ 31 - 0
tmp_file/故障时间整理.py

@@ -0,0 +1,31 @@
+import pandas as pd
+
+df = pd.read_csv(r'C:\Users\Administrator\Documents\WeChat Files\anmox-\FileStorage\File\2024-12\26故障.csv',
+                 encoding='gbk')
+df['开始时间'] = pd.to_datetime(df['开始时间'], errors='coerce')
+df['结束时间'] = pd.to_datetime(df['结束时间'], errors='coerce')
+time_df = pd.DataFrame(df.groupby(['开始时间'])['结束时间'].max())
+time_df.reset_index(inplace=True)
+time_df.sort_values(by='开始时间', inplace=True)
+
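+# sweep the intervals in start-time order and keep only those whose end time pushes
+# past the running maximum, i.e. drop intervals fully covered by an earlier one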
+datas = set()
+max_row = None
+for index, row in time_df.iterrows():
+    if index == 0:
+        datas.add((row['开始时间'], row['结束时间']))
+        max_row = row
+        continue
+
+    if row['结束时间'] > max_row['结束时间']:
+        datas.add((row['开始时间'], row['结束时间']))
+        max_row = row
+
+result_df = pd.DataFrame()
+for begin, end in datas:
+    print(begin, end)
+    now_df = df[(df['开始时间'] == begin) & (df['结束时间'] == end)]
+    now_df = now_df.tail(1)
+    result_df = pd.concat([result_df, now_df])
+
+result_df.sort_values(by='开始时间', inplace=True)
+result_df.to_csv(r'd:\data\26故障_new.csv', encoding='utf8', index=False)

+ 97 - 0
tmp_file/新华水电列名对比.py

@@ -0,0 +1,97 @@
+import datetime
+
+from os import path
+
+# pd, read_excel_files and read_file_to_df come in via this wildcard import
+from utils.file.trans_methods import *
+
+def boolean_is_check_data(df_cols):
+    fault_list = ['快速停机', '故障名称', '故障代码', '故障停机', '人工停机', '风机紧急停机', '工作模式']
+
+    df_cols = [str(i).split('_')[-1] for i in df_cols]
+    for fault in fault_list:
+        if fault in df_cols:
+            return True
+
+    return False
+
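+# a sheet counts as fault/event data when any column name (after stripping the
+# 'prefix_' part) matches one of the fault keywords above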
+
+def compareTwoFolders(list1, other_dfs):
+    for is_fault in [True]:
+        result_df = pd.DataFrame()
+        # for df1 in df1s:
+        #     tmp_list = [str(i).split('_')[-1] for i in list(df1.columns) if i != 'sheet_name']
+        #     if is_fault:
+        #         if boolean_is_check_data(df1.columns):
+        #             list1.extend(tmp_list)
+        #     else:
+        #         if not boolean_is_check_data(df1.columns):
+        #             list1.extend(tmp_list)
+
+        set1 = set(list1)
+
+        list1 = list(set1)
+        list1.sort()
+
+        result_df['完整列名'] = list1
+
+        for wind_name, dfs in other_dfs.items():
+
+            list2 = list()
+            for df in dfs:
+                tmp_list = [str(i).split('_')[-1] for i in list(df.columns) if i != 'sheet_name']
+                if is_fault:
+                    if boolean_is_check_data(df.columns):
+                        list2.extend(tmp_list)
+                else:
+                    if not boolean_is_check_data(df.columns):
+                        list2.extend(tmp_list)
+
+            set2 = set(list2)
+            list2 = list(set2)
+            list2.sort()
+
+            list3 = list(set1 - set2)
+            list3.sort()
+
+            # list4 = list(set2 - set1)
+            # list4.sort()
+            # print(list3)
+            # print(list4)
+
+            max_count = len(list1)
+            list1.extend([''] * (max_count - len(list1)))
+            list2.extend([''] * (max_count - len(list2)))
+            list3.extend([''] * (max_count - len(list3)))
+            # list4.extend([''] * (max_count - len(list4)))
+
+            result_df[str(wind_name) + '字段'] = list2
+            result_df[str(wind_name) + '比完整列名少字段'] = list3
+            # result_df['风机' + str(wind_name) + '_比风机1多字段'] = list4
+
+        file_name = 'col_compare.csv' if not is_fault else 'col_compare_fault.csv'
+
+        result_df.to_csv(file_name, encoding='utf-8', index=False)
+
+
+if __name__ == '__main__':
+    begin = datetime.datetime.now()
+    dir2 = r'D:\data\新华水电\风机SCADA数据'
+    files2 = read_excel_files(dir2)
+
+    other_dfs = dict()
+    list1 = list()
+    for file in files2:
+        month = path.basename(path.dirname(path.dirname(file)))[0:2]
+        wind_name = month + path.basename(path.dirname(file)).split('#')[0] + '号风机'
+        df = read_file_to_df(file, nrows=1)
+        if boolean_is_check_data(df.columns):
+            list1.extend([str(i).split('_')[-1] for i in list(df.columns) if i != 'sheet_name'])
+        if wind_name in other_dfs.keys():
+            other_dfs[wind_name].append(df)
+        else:
+            other_dfs[wind_name] = [df]
+
+    # with multiprocessing.Pool(10) as pool:
+    #     df2s = pool.starmap(read_file_to_df, [(file, list(), None, 1) for file in files2])
+    #
+    list1 = [i for i in list(set(list1)) if i != 'sheet_name']
+    compareTwoFolders(list1, other_dfs)
+
+    print(datetime.datetime.now() - begin)

+ 35 - 0
tmp_file/白玉山限电损失.py

@@ -0,0 +1,35 @@
+from os import path, walk
+
+import pandas as pd
+
+read_path = r'D:\data\白玉山后评估数据资料\需要整理的数据\每月发电量和限电量、限电率'
+
+all_paths = list()
+for root, dirs, files in walk(read_path):
+    if files:
+        for file in files:
+            # file names look like '...(YYYYMMDD_...': keep 2023-09 through 2024-08
+            year_month = int(file.split("(")[1].split("_")[0])
+            if 20230901 <= year_month < 20240901:
+                all_paths.append(path.join(root, file))
+
+df = pd.DataFrame()
+
+for file_path in all_paths:  # not named 'path', which would shadow os.path
+    now_df = pd.read_excel(file_path, usecols=['设备名称', '统计时间', '限电损失电量(kWh)'], header=2)
+    now_df = now_df[now_df['设备名称'].str.startswith("#")]
+    df = pd.concat([df, now_df])
+
+# manual spot check of one turbine
+print(df[df['设备名称'] == '#34'])
+
+df = df[['设备名称', '限电损失电量(kWh)']]
+group_df = df.groupby('设备名称').sum()
+
+result_df = pd.DataFrame(group_df)
+result_df.reset_index(inplace=True)
+result_df.columns = ['设备名称', '总限电损失电量(kWh)']
+result_df.sort_values(by=['设备名称'], inplace=True)
+
+print(result_df)
+
+result_df.to_csv("设备总限电损失.csv", encoding='utf-8', index=False)

+ 98 - 0
tmp_file/陕西建工陕西智华.py

@@ -0,0 +1,98 @@
+import os
+import sys
+from concurrent.futures.thread import ThreadPoolExecutor
+
+sys.path.insert(0, os.path.abspath(__file__).split("tmp_file")[0])
+
+import datetime
+import multiprocessing
+
+import pandas as pd
+
+from utils.file.trans_methods import read_files, copy_to_new, read_excel_files, read_file_to_df
+from utils.zip.unzip import get_desc_path, unzip
+
+
+def get_real_path(win_path):
+    return win_path.replace(r'Z:', r'/data/download').replace("\\", '/')
+
+
+def unzip_or_remove(file, tmp_dir):
+    if str(file).endswith("zip"):
+        unzip(file, tmp_dir)
+    else:
+        # copy the plain file into tmp_dir alongside the unzipped archives
+        copy_to_new(file, os.path.join(tmp_dir, os.path.basename(file)))
+
+
+def read_file_to_df_and_select(file_path):
+    select_cols = ['Timestamp', 'Los', 'Distance', 'HWS(hub)', 'HWS(hub)status', 'DIR(hub)', 'DIR(hub)status']
+    df = read_file_to_df(file_path, read_cols=select_cols)
+    condition1 = df['HWS(hub)status'] > 0.8
+    condition2 = df['DIR(hub)status'] > 0.8
+    condition3 = df['Distance'].isin([70, 90])
+
+    df = df[condition1 & condition2 & condition3]
+    return df
+
+
+def read_month_data_and_select(month, files, gonglv_df):
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        dfs = list(executor.map(read_file_to_df_and_select, files))
+
+    df = pd.concat(dfs, ignore_index=True)
+
+    df['Time1'] = df['Timestamp'].apply(lambda x: x.split('.')[0])
+    df['Time1'] = pd.to_datetime(df['Time1'], errors='coerce')
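+    # ceil each timestamp to the next 10 s boundary so the lidar rows line up with
+    # the 10 s power series, e.g. 12:00:03 -> 12:00:10 while 12:00:10 stays put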
+    df['Time1'] = df['Time1'].apply(
+        lambda x: x + datetime.timedelta(seconds=10 - x.second % 10) if x.second % 10 != 0 else x)
+    del gonglv_df['month']
+    result_df = pd.merge(df, gonglv_df, left_on='Time1', right_on='Time1')
+    result_df.sort_values(by='Time1', inplace=True)
+    save_dir = get_real_path(r'Z:\偏航误差验证数据\整理结果')
+    # save_dir = r'D:\data\pianhang\result'
+    result_df.to_csv(os.path.join(save_dir, f'{month}.csv'), encoding='utf8', index=False)
+
+
+if __name__ == '__main__':
+    read_dir = r'Z:\偏航误差验证数据\新华佳县雷达数据'
+    read_dir = get_real_path(read_dir)
+
+    tmp_dir = get_real_path(r'Z:\偏航误差验证数据\tmp_data')
+    gonglv_dir = get_real_path(r'Z:\偏航误差验证数据\陕西建工陕西智华\report\output')
+
+    # read_dir = r'D:\data\pianhang\1'
+    # tmp_dir = r'D:\data\pianhang\tmp'
+    # gonglv_dir = r'D:\data\pianhang\2'
+
+    gonglv_files = read_excel_files(gonglv_dir)
+
+    with multiprocessing.Pool(20) as pool:
+        dfs = pool.starmap(read_file_to_df, [(i, ['collect_time', 'a0216']) for i in gonglv_files])
+
+    gonglv_df = pd.concat(dfs, ignore_index=True)
+    gonglv_df.columns = ['Time1', '功率']
+    gonglv_df['Time1'] = pd.to_datetime(gonglv_df['Time1'], errors='coerce')
+    gonglv_df['month'] = gonglv_df['Time1'].dt.month
+
+    all_files = read_files(tmp_dir)
+
+    all_files = [i for i in all_files if str(os.path.basename(i)).startswith('WindSpeed2024')]
+
+    # with multiprocessing.Pool(20) as pool:
+    #     pool.starmap(unzip_or_remove, [(file, tmp_dir) for file in all_files])
+
+    month_map = dict()
+    for file in all_files:
+        base_name = os.path.basename(file)
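+        # names look like 'WindSpeed2024MMDD...': characters 13:15 are the month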
+        month = base_name[13:15]
+        if month in month_map.keys():
+            month_map[month].append(file)
+        else:
+            month_map[month] = [file]
+
+    with multiprocessing.Pool(5) as pool:
+        pool.starmap(read_month_data_and_select,
+                     [(month, files, gonglv_df[gonglv_df['month'] == int(month)]) for month, files in
+                      month_map.items()])