1 tahun lalu · 286ae47332
--- a/app_run.py
+++ b/app_run.py
@@ -49,7 +49,11 @@ if __name__ == '__main__':
 
															     if len(sys.argv) >= 2:
														
 
															         env = sys.argv[1]
														
 
															-    conf_path = path.abspath(f"./conf/etl_config_{env}.yaml")
														
 
															+    if env.endswith('.yaml'):
														
 
															+        conf_path = env
														
 
															+    else:
														
 
															+        conf_path = path.abspath(f"./conf/etl_config_{env}.yaml")
														
 
															+
														
 
															     environ['ETL_CONF'] = conf_path
														
 
															     yaml_config = yaml_conf(conf_path)
														
 
															     environ['env'] = env
														
--- a/conf/etl_config_tidbprod.yaml
+++ b/conf/etl_config_tidbprod.yaml
@@ -0,0 +1,31 @@
 
															+plt:
														
 
															+  database: energy
														
 
															+  host: 192.168.50.234
														
 
															+  password: '123456'
														
 
															+  port: 4000
														
 
															+  user: root
														
 
															+
														
 
															+trans:
														
 
															+  database: energy_data_prod
														
 
															+  host: 192.168.50.235
														
 
															+  password: admin123456
														
 
															+  port: 4000
														
 
															+  user: root
														
 
															+
														
 
															+# 如果要放在原始路径,则配置这个 以下面的名称作为切割点,新建清理数据文件夹
														
 
															+etl_origin_path_contain: 收资数据
														
 
															+# 如果单独保存,配置这个路径
														
 
															+save_path:
														
 
															+
														
 
															+# 日志保存路径
														
 
															+log_path_dir: /data/logs/no_batch_trans_tidb
														
 
															+
														
 
															+# 临时文件存放处,有些甲方公司隔得tmp太小,只好自己配置
														
 
															+tmp_base_path: /data/download/collection_data/tmp
														
 
															+
														
 
															+run_batch_count: 2
														
 
															+
														
 
															+archive_path: /data/download/collection_data/archive/prod_ti_db
														
 
															+
														
 
															+
														
 
															+use_tidb: True
														
--- a/etl/common/PathsAndTable.py
+++ b/etl/common/PathsAndTable.py
@@ -22,6 +22,9 @@ class PathsAndTable(object):
 
															         self.wind_col_trans = wind_col_trans
														
 
															         save_path_conf = read_conf(yaml_config, "save_path")
														
 
															+
														
 
															+        self.use_tidb = read_conf(yaml_config, 'use_tidb', False)
														
 
															+
														
 
															         self.tmp_base_path = read_conf(yaml_config, "tmp_base_path", "/tmp")
														
 
															         if save_path_conf:
														
 
															             self.save_path = save_path_conf + sep + self.wind_farm_name
														
@@ -73,7 +76,7 @@ class PathsAndTable(object):
 
															         if self.save_db:
														
 
															             trans_print("开始创建表")
														
 
															             if self.read_type in ['second', 'minute']:
														
 
															-                creat_min_sec_table(self.get_table_name(), self.read_type)
														
 
															+                creat_min_sec_table(self.get_table_name(), self.read_type, self.use_tidb)
														
 
															             elif self.read_type in ['fault', 'warn']:
														
 
															                 create_warn_fault_table(self.get_table_name())
														
 
															             else:
														
--- a/etl/common/SaveToDb.py
+++ b/etl/common/SaveToDb.py
@@ -4,7 +4,7 @@ import traceback
 
															 from etl.common.PathsAndTable import PathsAndTable
														
 
															 from service.trans_conf_service import update_trans_transfer_progress
														
 
															-from service.trans_service import save_partation_file_to_db, save_file_to_db
														
 
															+from service.trans_service import save_scada_file_to_db, save_file_to_db
														
 
															 from utils.file.trans_methods import split_array
														
 
															 from utils.log.trans_log import trans_print
														
 
															 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
														
@@ -32,13 +32,13 @@ class SaveToDb(object):
 
															         all_arrays = split_array(all_saved_files, split_count)
														
 
															         try:
														
 
															             for index, arr in enumerate(all_arrays):
														
 
															-                with multiprocessing.Pool(split_count) as pool:
														
 
															+                with multiprocessing.Pool(10) as pool:
														
 
															                     if self.pathsAndTable.read_type in ['minute', 'second']:
														
 
															-                        pool.starmap(save_partation_file_to_db,
														
 
															+                        pool.starmap(save_scada_file_to_db,
														
 
															                                      [(self.pathsAndTable.get_table_name(), file,
														
 
															                                        self.pathsAndTable.wind_col_trans[os.path.basename(file).split(".")[0]],
														
 
															                                        os.path.basename(os.path.dirname(file)),
														
 
															-                                       self.batch_count) for file in arr])
														
 
															+                                       self.batch_count,self.pathsAndTable.use_tidb) for file in arr])
														
 
															                     else:
														
 
															                         pool.starmap(save_file_to_db,
														
 
															                                      [(self.pathsAndTable.get_table_name(), file, self.batch_count) for file in arr])
														
--- a/etl/wind_power/min_sec/ReadAndSaveTmp.py
+++ b/etl/wind_power/min_sec/ReadAndSaveTmp.py
@@ -244,9 +244,15 @@ class ReadAndSaveTmp(object):
 
															                                      resolve_col_prefix=self.trans_param.resolve_col_prefix)
														
 
															             else:
														
 
															                 if self.trans_param.need_valid_cols:
														
 
															-                    df = read_file_to_df(file_path, read_cols, trans_cols=trans_cols)
														
 
															+                    if self.trans_param.resolve_col_prefix:
														
 
															+                        df = read_file_to_df(file_path, trans_cols=trans_cols,
														
 
															+                                             resolve_col_prefix=self.trans_param.resolve_col_prefix)
														
 
															+                    else:
														
 
															+                        df = read_file_to_df(file_path, read_cols, trans_cols=trans_cols,
														
 
															+                                             resolve_col_prefix=self.trans_param.resolve_col_prefix)
														
 
															                 else:
														
 
															-                    df = read_file_to_df(file_path, trans_cols=trans_cols)
														
 
															+                    df = read_file_to_df(file_path, trans_cols=trans_cols,
														
 
															+                                         resolve_col_prefix=self.trans_param.resolve_col_prefix)
														
 
															             # 处理列名前缀问题
														
 
															             if self.trans_param.resolve_col_prefix:
														
@@ -360,6 +366,6 @@ class ReadAndSaveTmp(object):
 
															         trans_print("开始保存数据到临时文件")
														
 
															         begin = datetime.datetime.now()
														
 
															         self.read_file_and_save_tmp()
														
 
															-        update_trans_transfer_progress(self.pathsAndTable.id,  50,
														
 
															+        update_trans_transfer_progress(self.pathsAndTable.id, 50,
														
 
															                                        self.pathsAndTable.save_db)
														
 
															         trans_print("保存数据到临时文件结束,耗时:", datetime.datetime.now() - begin)
														
--- a/etl/wind_power/wave/WaveTrans.py
+++ b/etl/wind_power/wave/WaveTrans.py
@@ -1,17 +1,18 @@
 
															-import datetime
														
 
															 import json
														
 
															 import multiprocessing
														
 
															+import traceback
														
 
															 from service.plt_service import get_all_wind
														
 
															+from service.trans_conf_service import update_trans_status_running, update_trans_transfer_progress, \
														
 
															+    update_trans_status_success, update_trans_status_error
														
 
															 from service.trans_service import get_wave_conf, save_df_to_db, get_or_create_wave_table, \
														
 
															     get_wave_data, delete_exist_wave_data
														
 
															-from service.trans_conf_service import update_trans_status_running, update_trans_transfer_progress, \
														
 
															-    update_trans_status_success
														
 
															 from utils.file.trans_methods import *
														
 
															 from utils.log.trans_log import set_trance_id
														
 
															 from utils.systeminfo.sysinfo import get_available_cpu_count_with_percent
														
 
															 exec("from os.path import *")
														
 
															+exec("import re")
														
 
															 class WaveTrans(object):
														
@@ -46,8 +47,7 @@ class WaveTrans(object):
 
															         update_trans_status_running(self.id)
														
 
															         trance_id = '-'.join([self.wind_farm_code, 'wave'])
														
 
															         set_trance_id(trance_id)
														
 
															-        all_files = read_files(self.read_dir, ['txt'])
														
 
															-        self.data_count = len(all_files)
														
 
															+        all_files = read_files(self.read_dir, ['txt', 'csv'])
														
 
															         update_trans_transfer_progress(self.id, 5)
														
 
															         # 最大取系统cpu的 1/2
														
 
															         split_count = get_available_cpu_count_with_percent(1 / 2)
														
@@ -68,7 +68,7 @@ class WaveTrans(object):
 
															         mesure_poins = [key for key, value in wave_conf.items() if str(key).startswith('conf_') and value]
														
 
															         for point in mesure_poins:
														
 
															-            map_dict[wave_conf[point]] = point.replace('conf_', '')
														
 
															+            map_dict[wave_conf[point].strip()] = point.replace('conf_', '')
														
 
															         wind_turbine_name_set = set()
														
@@ -77,8 +77,15 @@ class WaveTrans(object):
 
															         for index, now_array in enumerate(all_array):
														
 
															             index_begin = datetime.datetime.now()
														
 
															             with multiprocessing.Pool(split_count) as pool:
														
 
															-                file_datas = pool.starmap(self.get_data_exec,
														
 
															-                                          [(base_param_exec, i, list(map_dict.keys())) for i in now_array])
														
 
															+                try:
														
 
															+                    file_datas = pool.starmap(self.get_data_exec,
														
 
															+                                              [(base_param_exec, i, list(map_dict.keys())) for i in now_array])
														
 
															+                    trans_print(f'总数:{len(now_array)},返回个数{len(file_datas)}')
														
 
															+                except Exception as e:
														
 
															+                    message = str(e)
														
 
															+                    trans_print(traceback.format_exc())
														
 
															+                    update_trans_status_error(self.id, message[0:len(message) if len(message) < 100 else 100])
														
 
															+                    raise e
														
 
															             update_trans_transfer_progress(self.id, 20 + int(index / total_index * 60))
														
 
															             trans_print("读取文件耗时:", datetime.datetime.now() - self.begin)
														
@@ -102,13 +109,13 @@ class WaveTrans(object):
 
															                              mesure_data])
														
 
															             if result_list:
														
 
															+                self.data_count = self.data_count + len(result_list)
														
 
															                 df = pd.DataFrame(result_list,
														
 
															                                   columns=['wind_turbine_name', 'time_stamp', 'rotational_speed', 'sampling_frequency',
														
 
															                                            'mesure_point_name', 'type', 'mesure_data'])
														
 
															                 df['time_stamp'] = pd.to_datetime(df['time_stamp'], errors='coerce')
														
 
															                 df['mesure_point_name'] = df['mesure_point_name'].map(map_dict)
														
 
															                 df.dropna(subset=['mesure_point_name'], inplace=True)
														
 
															-
														
 
															                 df['wind_turbine_number'] = df['wind_turbine_name'].map(all_wind).fillna(df['wind_turbine_name'])
														
 
															                 df['mesure_data'] = df['mesure_data'].apply(lambda x: json.dumps(x))
														
--- a/nutika_package.sh
+++ b/nutika_package.sh
@@ -1,3 +1,3 @@
 
															 #!/bin/bash
														
 
															-nuitka --standalone --onefile --include-data-files=./conf/*=./conf/  --output-dir=/home/wzl/project/install_package --remove-output app_run.py
														
 
															+nuitka --standalone --onefile --static-libpython=yes  --include-data-files=./conf/*=./conf/  --output-dir=/home/wzl/project/install_package --remove-output app_run.py
														
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,6 @@ PyMySQL~=1.1.0
 
															 SQLAlchemy~=2.0.30
														
 
															 rarfile~=4.2
														
 
															 PyYAML~=6.0.1
														
 
															-matplotlib~=3.9.0
														
 
															 chardet~=3.0.4
														
 
															 psutil~=6.0.0
														
 
															 openpyxl ~= 3.1.4
														
--- a/service/plt_service.py
+++ b/service/plt_service.py
@@ -39,3 +39,11 @@ def get_base_wind_and_power(wind_turbine_number):
 
															         return None
														
 
															     return dict_datas
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    from os import path,environ
														
 
															+    env = 'prod'
														
 
															+    conf_path = path.abspath(f"./conf/etl_config_{env}.yaml")
														
 
															+    environ['ETL_CONF'] = conf_path
														
 
															+    environ['env'] = env
														
 
															+    print(get_all_wind('WOF039800012'))
														
--- a/service/trans_service.py
+++ b/service/trans_service.py
@@ -6,10 +6,10 @@ from os import *
 
															 import pandas as pd
														
 
															+from service.common_connect import trans
														
 
															 from service.trans_conf_service import create_wave_table
														
 
															 from utils.file.trans_methods import split_array
														
 
															 from utils.log.trans_log import trans_print
														
 
															-from service.common_connect import trans
														
 
															 def get_min_sec_conf(field_code, trans_type) -> dict:
														
@@ -58,7 +58,7 @@ def get_wave_conf(field_code) -> dict:
 
															     return res[0]
														
 
															-def creat_min_sec_table(table_name, trans_type):
														
 
															+def creat_min_sec_table(table_name, trans_type, use_tidb=False):
														
 
															     exists_table_sql = f"""
														
 
															     select count(1) as count from information_schema.tables where table_schema = '{trans.database}' and table_name = '{table_name}'
														
 
															     """
														
@@ -135,7 +135,10 @@ def creat_min_sec_table(table_name, trans_type):
 
															              KEY `time_stamp` (`time_stamp`),
														
 
															              KEY `wind_turbine_number` (`wind_turbine_number`),
														
 
															              {add_key}
														
 
															-        )
														
 
															+        ) 
														
 
															+        """
														
 
															+        # if not use_tidb:
														
 
															+        create_sql = create_sql + f"""
														
 
															         PARTITION BY LIST COLUMNS ({key}, `wind_turbine_number`) (
														
 
															         PARTITION pDefault VALUES IN ((000000, 'wind_turbine_number'))
														
 
															         ) 
														
@@ -177,18 +180,37 @@ def add_or_remove_partation(table_name: str, date_str: str, wind_turbine_number)
 
															         add_partation(table_name, date_str, wind_turbine_number)
														
 
															-def save_partation_file_to_db(table_name: str, file: str, wind_turbine_number, date_str, batch_count=100000):
														
 
															+def drop_exists_data(table_name, wind_turbine_number, min_date, max_date):
														
 
															+    # sql = f"# delete from {table_name} where wind_turbine_number = '{wind_turbine_number}' and time_stamp between '{min_date}' and '{max_date}'"
														
 
															+
														
 
															+    sql = f"""
														
 
															+    BATCH ON `time_stamp`, `wind_turbine_number` LIMIT 1000 
														
 
															+    DELETE FROM `{table_name}` 
														
 
															+    WHERE `rated_at` >= "{min_date}" 
														
 
															+    AND `rated_at` <= "{max_date}"
														
 
															+    AND `wind_turbine_number` = "{wind_turbine_number}";
														
 
															+    """
														
 
															+
														
 
															+    count = trans.execute(sql)
														
 
															+    trans_print(f"删除数据{count}条，{table_name},{wind_turbine_number},{min_date},{max_date}")
														
 
															+
														
 
															+
														
 
															+def save_scada_file_to_db(table_name, file: str, wind_turbine_number, date_str, batch_count=100000, use_tidb=False):
														
 
															     base_name = path.basename(file)
														
 
															-    # wind_turbine_number = path.basename(file).split(".")[0]
														
 
															-    # date_str = path.basename(path.dirname(file))
														
 
															+    df = pd.read_csv(file)
														
 
															+    # if use_tidb:
														
 
															+    #     min_date = df['time_stamp'].min()
														
 
															+    #     max_date = df['time_stamp'].max()
														
 
															+    #     # drop_exists_data(table_name, wind_turbine_number, min_date, max_date)
														
 
															+    # else:
														
 
															+    #     add_or_remove_partation(table_name, date_str, wind_turbine_number)
														
 
															     add_or_remove_partation(table_name, date_str, wind_turbine_number)
														
 
															     try:
														
 
															-        for i, df in enumerate(pd.read_csv(file, chunksize=batch_count)):
														
 
															-            trans.execute_df_save(df, table_name)
														
 
															-            count = (i + 1) * batch_count
														
 
															-            trans_print(base_name, f"Chunk {count} written to MySQL.")
														
 
															+        trans_print(f"保存{table_name},{base_name},{wind_turbine_number},数据：{df.shape[0]}")
														
 
															+        trans.execute_df_save(df, table_name, batch_count)
														
 
															+        trans_print(f"保存到{table_name},{base_name},{wind_turbine_number} 成功,总条数:{df.shape[0]}")
														
 
															     except Exception as e:
														
 
															         trans_print(traceback.format_exc())
														
 
															         message = base_name + str(e)
														
@@ -198,11 +220,10 @@ def save_partation_file_to_db(table_name: str, file: str, wind_turbine_number, d
 
															 def save_file_to_db(table_name: str, file: str, batch_count=100000):
														
 
															     base_name = path.basename(file)
														
 
															     try:
														
 
															-        for i, df in enumerate(pd.read_csv(file, chunksize=batch_count)):
														
 
															-            # df.to_sql(table_name, engine, if_exists='append', index=False)
														
 
															-            trans.execute_df_save(df, table_name)
														
 
															-            count = (i + 1) * batch_count
														
 
															-            trans_print(base_name, f"Chunk {count} written to MySQL.")
														
 
															+        df = pd.read_csv(file)
														
 
															+        trans_print(f"保存{table_name},总条数：{df.shape[0]}")
														
 
															+        trans.execute_df_save(df, table_name, batch_count)
														
 
															+        trans_print(f"保存到{table_name}成功,总条数:{df.shape[0]}")
														
 
															     except Exception as e:
														
 
															         trans_print(traceback.format_exc())
														
 
															         message = base_name + str(e)
														
@@ -210,12 +231,10 @@ def save_file_to_db(table_name: str, file: str, batch_count=100000):
 
															 def save_df_to_db(table_name: str, df: pd.DataFrame(), batch_count=100000):
														
 
															-    split_dfs = [df.iloc[i:i + batch_count] for i in range(0, len(df), batch_count)]
														
 
															     try:
														
 
															-        for i, split_df in enumerate(split_dfs):
														
 
															-            trans.execute_df_save(split_df, table_name)
														
 
															-            count = (i + 1) * batch_count
														
 
															-            trans_print(f"Chunk {count} written to MySQL.")
														
 
															+        trans_print(f"保存{table_name},总条数：{df.shape[0]}")
														
 
															+        trans.execute_df_save(df, table_name, batch_count)
														
 
															+        trans_print(f"保存到{table_name}成功,总条数:{df.shape[0]}")
														
 
															     except Exception as e:
														
 
															         trans_print(traceback.format_exc())
														
 
															         raise Exception(str(e))
														
--- a/utils/db/ConnectMysql.py
+++ b/utils/db/ConnectMysql.py
@@ -48,8 +48,8 @@ class ConnectMysql:
 
															         dbname = config['database']
														
 
															         return create_engine(f'mysql+pymysql://{username}:{password}@{host}:{port}/{dbname}')
														
 
															-    def execute_df_save(self, df, table_name):
														
 
															-        df.to_sql(table_name, self.get_engine(), index=False, if_exists='append')
														
 
															+    def execute_df_save(self, df, table_name, chunk_size=10000):
														
 
															+        df.to_sql(table_name, self.get_engine(), index=False, if_exists='append', chunksize=chunk_size)
														
 
															     def read_sql_to_df(self, sql):
														
 
															         df = pd.read_sql_query(sql, self.get_engine())
														
--- a/utils/db/ConnectMysql_tidb_fix.py
+++ b/utils/db/ConnectMysql_tidb_fix.py
@@ -0,0 +1,80 @@
 
															+import time
														
 
															+import traceback
														
 
															+from os import *
														
 
															+
														
 
															+import pandas as pd
														
 
															+import pymysql
														
 
															+from pymysql.cursors import DictCursor
														
 
															+from sqlalchemy import create_engine
														
 
															+
														
 
															+from utils.conf.read_conf import yaml_conf
														
 
															+from utils.log.trans_log import trans_print
														
 
															+
														
 
															+
														
 
															+class ConnectMysql:
														
 
															+
														
 
															+    def __init__(self, connet_name):
														
 
															+        self.yaml_data = yaml_conf(environ.get('ETL_CONF'))
														
 
															+        self.connet_name = connet_name
														
 
															+        self.config = self.yaml_data[self.connet_name]
														
 
															+        self.database = self.config['database']
														
 
															+
														
 
															+    # 从连接池中获取一个连接
														
 
															+    def get_conn(self):
														
 
															+        return pymysql.connect(**self.config, autocommit=True)
														
 
															+
														
 
															+    # 使用连接执行sql
														
 
															+    def execute(self, sql, params=tuple()):
														
 
															+
														
 
															+        with self.get_conn() as conn:
														
 
															+            with conn.cursor(cursor=DictCursor) as cursor:
														
 
															+                try:
														
 
															+                    cursor.execute(sql, params)
														
 
															+                    trans_print("开始执行SQL:", cursor._executed)
														
 
															+                    conn.commit()
														
 
															+                    result = cursor.fetchall()
														
 
															+                    return result
														
 
															+                except Exception as e:
														
 
															+                    trans_print(f"执行sql：{sql}，报错：{e}")
														
 
															+                    trans_print(traceback.format_exc())
														
 
															+                    conn.rollback()
														
 
															+                    raise e
														
 
															+
														
 
															+    def get_engine(self):
														
 
															+        config = self.config
														
 
															+        username = config['user']
														
 
															+        password = config['password']
														
 
															+        host = config['host']
														
 
															+        port = config['port']
														
 
															+        dbname = config['database']
														
 
															+        return create_engine(f'mysql+pymysql://{username}:{password}@{host}:{port}/{dbname}',
														
 
															+                             pool_pre_ping=True,
														
 
															+                             isolation_level="READ COMMITTED",
														
 
															+                             connect_args={
														
 
															+                                 'connect_timeout': 30,
														
 
															+                                 'read_timeout': 120,
														
 
															+                                 'write_timeout': 7200
														
 
															+                             })
														
 
															+
														
 
															+    def execute_df_save(self, df, table_name, chunksize=10000):
														
 
															+        engine = self.get_engine()
														
 
															+        try:
														
 
															+            retry_count = 0
														
 
															+            max_retries = 3
														
 
															+            while retry_count < max_retries:
														
 
															+                try:
														
 
															+                    df.to_sql(table_name, engine, if_exists='append', index=False, chunksize=chunksize)
														
 
															+                except Exception as e:
														
 
															+                    retry_count += 1
														
 
															+                    trans_print(f" 第 {retry_count} 次重试, 错误: {str(e)}")
														
 
															+                    time.sleep(5 * retry_count)  # 指数退避
														
 
															+                    if retry_count == max_retries:
														
 
															+                        trans_print(f"处理失败: {str(e)}")
														
 
															+                        raise
														
 
															+        except Exception as e:
														
 
															+            engine.dispose()
														
 
															+            raise e
														
 
															+
														
 
															+    def read_sql_to_df(self, sql):
														
 
															+        df = pd.read_sql_query(sql, self.get_engine())
														
 
															+        return df
														
--- a/utils/draw/draw_file.py
+++ b/utils/draw/draw_file.py
@@ -1,81 +1,81 @@
 
															-import matplotlib
														
 
															-
														
 
															-from utils.file.trans_methods import create_file_path
														
 
															-
														
 
															-matplotlib.use('Agg')
														
 
															-matplotlib.rcParams['font.family'] = 'SimHei'
														
 
															-matplotlib.rcParams['font.sans-serif'] = ['SimHei']
														
 
															-matplotlib.rcParams['axes.unicode_minus'] = False
														
 
															-from matplotlib import pyplot as plt
														
 
															-
														
 
															-
														
 
															-def scatter(title, x_label, y_label, x_values, y_values, color=None, col_map=dict(), size=10,
														
 
															-            save_file_path=''):
														
 
															-    if save_file_path:
														
 
															-        create_file_path(save_file_path, True)
														
 
															-    else:
														
 
															-        save_file_path = title + '.png'
														
 
															-
														
 
															-    plt.figure(figsize=(8, 6))
														
 
															-    plt.title(title, fontsize=16)
														
 
															-    plt.xlabel(x_label, fontsize=14)
														
 
															-    plt.ylabel(y_label, fontsize=14)
														
 
															-    if color is not None:
														
 
															-        plt.scatter(x_values, y_values, s=size, c=color)
														
 
															-        if col_map:
														
 
															-            patches = [plt.Rectangle((0, 0), 1, 1, fc=c) for c in col_map.values()]
														
 
															-            plt.legend(patches, list(col_map.keys()))
														
 
															-    else:
														
 
															-        plt.scatter(x_values, y_values, s=size)
														
 
															-
														
 
															-    plt.savefig(save_file_path)
														
 
															-    plt.close()
														
 
															-
														
 
															-
														
 
															-if __name__ == '__main__':
														
 
															-    import pandas as pd
														
 
															-    import numpy as np
														
 
															-    from matplotlib import pyplot as plt
														
 
															-
														
 
															-    df = pd.read_csv(r"/home/wzl/test_data/2024_10_17_14_54_46_200k_Root.csv")
														
 
															-    df.reset_index(inplace=True, drop=True)
														
 
															-    df.columns = ['data']
														
 
															-
														
 
															-    # Calculate the moving average with a window of 3 (1 before, 1 after)
														
 
															-    window_size = 20
														
 
															-    moving_avg = df['data'].rolling(window=window_size).mean()
														
 
															-    df['moving_avg'] = moving_avg
														
 
															-    # Calculate the percentage difference
														
 
															-    percentage_diff = abs((df['data'] - moving_avg) / moving_avg) * 100
														
 
															-    df['percentage_diff'] = percentage_diff
														
 
															-    # Flag values that differ by more than threshold
														
 
															-    threshold = 3
														
 
															-    df['is_anomaly'] = percentage_diff < threshold
														
 
															-
														
 
															-    avg = df['data'].mean()
														
 
															-    df['avg']=df['data'] > avg
														
 
															-
														
 
															-
														
 
															-    difference_ratio = df.iloc[window_size:]
														
 
															-    difference_ratio.reset_index(inplace=True)
														
 
															-    # 创建图形和轴对象
														
 
															-    plt.figure(figsize=(10, 6))
														
 
															-    colors = np.where((difference_ratio['is_anomaly'] == True) & (difference_ratio['avg'] == True), 'r', np.where((difference_ratio['is_anomaly'] == False) & (difference_ratio['avg'] == False), 'g', 'b'))
														
 
															-
														
 
															-    datas = difference_ratio['data'].values
														
 
															-    # for i in range(len(datas)):
														
 
															-    #     plt.plot(i, datas[i], marker='o', color=colors[i])
														
 
															-
														
 
															-    plt.figure(figsize=(10, 6))
														
 
															-    plt.scatter([i for i in range(len(datas))], datas,  c=colors)
														
 
															-
														
 
															-    # 添加标题和标签
														
 
															-    plt.title('Difference Ratio of Each Data Point to Its Previous 10 Data Points Mean')
														
 
															-    plt.xlabel('Index')
														
 
															-    plt.ylabel('Difference Ratio')
														
 
															-
														
 
															-    # 显示网格
														
 
															-    plt.grid(True)
														
 
															-
														
 
															-    # 显示图形
														
 
															-    plt.show()
														
 
															+# import matplotlib
														
 
															+#
														
 
															+# from utils.file.trans_methods import create_file_path
														
 
															+#
														
 
															+# matplotlib.use('Agg')
														
 
															+# matplotlib.rcParams['font.family'] = 'SimHei'
														
 
															+# matplotlib.rcParams['font.sans-serif'] = ['SimHei']
														
 
															+# matplotlib.rcParams['axes.unicode_minus'] = False
														
 
															+# from matplotlib import pyplot as plt
														
 
															+#
														
 
															+#
														
 
															+# def scatter(title, x_label, y_label, x_values, y_values, color=None, col_map=dict(), size=10,
														
 
															+#             save_file_path=''):
														
 
															+#     if save_file_path:
														
 
															+#         create_file_path(save_file_path, True)
														
 
															+#     else:
														
 
															+#         save_file_path = title + '.png'
														
 
															+#
														
 
															+#     plt.figure(figsize=(8, 6))
														
 
															+#     plt.title(title, fontsize=16)
														
 
															+#     plt.xlabel(x_label, fontsize=14)
														
 
															+#     plt.ylabel(y_label, fontsize=14)
														
 
															+#     if color is not None:
														
 
															+#         plt.scatter(x_values, y_values, s=size, c=color)
														
 
															+#         if col_map:
														
 
															+#             patches = [plt.Rectangle((0, 0), 1, 1, fc=c) for c in col_map.values()]
														
 
															+#             plt.legend(patches, list(col_map.keys()))
														
 
															+#     else:
														
 
															+#         plt.scatter(x_values, y_values, s=size)
														
 
															+#
														
 
															+#     plt.savefig(save_file_path)
														
 
															+#     plt.close()
														
 
															+#
														
 
															+#
														
 
															+# if __name__ == '__main__':
														
 
															+#     import pandas as pd
														
 
															+#     import numpy as np
														
 
															+#     from matplotlib import pyplot as plt
														
 
															+#
														
 
															+#     df = pd.read_csv(r"/home/wzl/test_data/2024_10_17_14_54_46_200k_Root.csv")
														
 
															+#     df.reset_index(inplace=True, drop=True)
														
 
															+#     df.columns = ['data']
														
 
															+#
														
 
															+#     # Calculate the trailing moving average over the last window_size (20) samples
														
 
															+#     window_size = 20
														
 
															+#     moving_avg = df['data'].rolling(window=window_size).mean()
														
 
															+#     df['moving_avg'] = moving_avg
														
 
															+#     # Calculate the percentage difference
														
 
															+#     percentage_diff = abs((df['data'] - moving_avg) / moving_avg) * 100
														
 
															+#     df['percentage_diff'] = percentage_diff
														
 
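															+#     # Flag values whose deviation from the moving average is below threshold percent (NOTE: despite its name, is_anomaly is True for these in-range points)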
														
 
															+#     threshold = 3
														
 
															+#     df['is_anomaly'] = percentage_diff < threshold
														
 
															+#
														
 
															+#     avg = df['data'].mean()
														
 
															+#     df['avg']=df['data'] > avg
														
 
															+#
														
 
															+#
														
 
															+#     difference_ratio = df.iloc[window_size:]
														
 
															+#     difference_ratio.reset_index(inplace=True)
														
 
															+#     # 创建图形和轴对象
														
 
															+#     plt.figure(figsize=(10, 6))
														
 
															+#     colors = np.where((difference_ratio['is_anomaly'] == True) & (difference_ratio['avg'] == True), 'r', np.where((difference_ratio['is_anomaly'] == False) & (difference_ratio['avg'] == False), 'g', 'b'))
														
 
															+#
														
 
															+#     datas = difference_ratio['data'].values
														
 
															+#     # for i in range(len(datas)):
														
 
															+#     #     plt.plot(i, datas[i], marker='o', color=colors[i])
														
 
															+#
														
 
															+#     plt.figure(figsize=(10, 6))
														
 
															+#     plt.scatter([i for i in range(len(datas))], datas,  c=colors)
														
 
															+#
														
 
															+#     # 添加标题和标签
														
 
															+#     plt.title('Difference Ratio of Each Data Point to Its Previous 10 Data Points Mean')
														
 
															+#     plt.xlabel('Index')
														
 
															+#     plt.ylabel('Difference Ratio')
														
 
															+#
														
 
															+#     # 显示网格
														
 
															+#     plt.grid(True)
														
 
															+#
														
 
															+#     # 显示图形
														
 
															+#     plt.show()