|
@@ -0,0 +1,196 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# @Time : 2024/5/16
|
|
|
+# @Author : 魏志亮
|
|
|
+import ast
|
|
|
+import datetime
|
|
|
+import os
|
|
|
+import shutil
|
|
|
+import warnings
|
|
|
+
|
|
|
+import chardet
|
|
|
+import pandas as pd
|
|
|
+
|
|
|
+from utils.log.import_data_log import log_print
|
|
|
+
|
|
|
+warnings.filterwarnings("ignore")
|
|
|
+
|
|
|
+
|
|
|
# Detect the text encoding of a file
def detect_file_encoding(filename):
    """
    Guess the text encoding of a file and map it to one of two codecs.

    A sample of the leading bytes is handed to ``chardet``. When the file
    starts with the gzip magic number the sample is taken from the
    decompressed stream instead, because the compressed bytes carry no
    information about the encoding of the text inside.

    :param filename: path of the file to inspect
    :return: 'utf-8' when chardet reports UTF-8 or ASCII; 'gb18030' in every
             other case, including when chardet cannot decide
    """
    import gzip
    import zlib

    # The first 1000 bytes are enough for most encoding detection.
    sample_size = 1000

    with open(filename, 'rb') as f:
        rawdata = f.read(sample_size)

    if rawdata[:2] == b'\x1f\x8b':
        try:
            with gzip.open(filename, 'rb') as gz:
                rawdata = gz.read(sample_size)
        except (OSError, EOFError, zlib.error):
            # Not a readable gzip stream after all: fall back to the raw
            # bytes, which is what was inspected before this check existed.
            pass

    result = chardet.detect(rawdata)
    encoding = result['encoding']

    log_print("文件类型:", filename, encoding)

    if encoding is None:
        encoding = 'gb18030'

    if encoding.lower() in ['utf-8', 'ascii', 'utf8', 'utf-8-sig']:
        return 'utf-8'

    return 'gb18030'
|
|
|
+
|
|
|
+
|
|
|
def del_blank(df=None, cols=None):
    """
    Strip leading and trailing whitespace from the text cells of columns.

    Only ``str`` cells are touched. Numbers, ``None`` and NaN stored in an
    object column are kept as they are; the ``.str`` accessor would replace
    them with NaN, or raise when the column holds no strings at all.

    :param df: DataFrame to clean; it is modified in place. Defaults to a
               new empty DataFrame.
    :param cols: names of the columns to clean. Defaults to no columns.
    :return: the same DataFrame object that was passed in
    :raises KeyError: if a name in ``cols`` is not a column of ``df``
    """
    if df is None:
        df = pd.DataFrame()
    if cols is None:
        cols = []

    for col in cols:
        column = df[col]
        if column.dtype == object:
            df[col] = column.map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
        elif pd.api.types.is_string_dtype(column.dtype):
            # Dedicated string dtypes hold only text and missing values, so
            # the vectorised accessor is safe and keeps the dtype.
            df[col] = column.str.strip()
    return df
|
|
|
+
|
|
|
+
|
|
|
# Split a sequence into several smaller sequences
def split_array(array, num):
    """
    Cut ``array`` into consecutive slices of at most ``num`` items.

    :param array: any sliceable sequence that supports ``len``
    :param num: slice length; also the step between slice starts
    :return: list of slices in original order; the last one may be shorter
    """
    chunks = []
    for start in range(0, len(array), num):
        stop = start + num
        chunks.append(array[start:stop])
    return chunks
|
|
|
+
|
|
|
+
|
|
|
def find_header(file_path, use_cols=None):
    """
    Locate the header row within the first 50 rows of a file.

    The file is read without a header. The first row that contains more than
    two of the expected column names is taken as the header row. Matches are
    counted per row, so names scattered over several rows cannot add up to a
    false hit, and the search stops at the first qualifying row.

    :param file_path: file to inspect, read through ``read_file_to_df``
    :param use_cols: expected column names. Defaults to none.
    :return: index label of the matching row plus one, or None when no row
             qualifies
    """
    # NOTE(review): the "+ 1" offset is kept from the original code; confirm
    # how callers consume the value.
    df = read_file_to_df(file_path, None, None, 50)
    expected_cols = [] if use_cols is None else use_cols

    for index, row in df.iterrows():
        values = row.values
        matched = sum(1 for col in expected_cols if col in values)
        if matched > 2:
            return index + 1

    return None
|
|
|
+
|
|
|
+
|
|
|
# Read a data file into a DataFrame
def read_file_to_df(file_path, use_cols=None, header=None, nrows=None):
    """
    Load a CSV, gzip-compressed CSV or Excel workbook into one DataFrame.

    Paths ending in "csv" or "gz" (case-insensitive) are read with
    ``pd.read_csv`` using the encoding from ``detect_file_encoding``. Any
    other path is opened as an Excel workbook: every sheet is read, tagged
    with a ``sheet_name`` column, and the sheets are concatenated. A
    ``file_name`` column holding the base name without its extension is
    added to the result.

    :param file_path: path of the file to read
    :param use_cols: columns to load. None or an empty list, tuple or string
                     loads every column, in both the CSV and Excel branches.
    :param header: forwarded to pandas as ``header``
    :param nrows: forwarded to pandas as ``nrows``
    :return: the loaded DataFrame
    :raises ValueError: on any read failure; the message carries the file's
                        base name and the original error is chained as the
                        cause
    """
    begin = datetime.datetime.now()
    log_print('开始读取文件', file_path)
    base_name = os.path.basename(file_path)

    # read_csv treats an empty usecols as "select no columns", so an empty
    # selection is normalised to None to load everything.
    if isinstance(use_cols, (list, tuple, str)) and not use_cols:
        use_cols = None

    df = pd.DataFrame()
    try:
        lower_path = str(file_path).lower()
        if lower_path.endswith(("csv", "gz")):
            encoding = detect_file_encoding(file_path)
            if lower_path.endswith("gz"):
                df = pd.read_csv(file_path, encoding=encoding, usecols=use_cols, compression='gzip',
                                 header=header, nrows=nrows)
            else:
                df = pd.read_csv(file_path, encoding=encoding, usecols=use_cols, header=header,
                                 on_bad_lines='warn', nrows=nrows)
        else:
            sheet_frames = []
            # The context manager closes the workbook even when a sheet
            # fails to parse.
            with pd.ExcelFile(file_path) as xls:
                for sheet_name in xls.sheet_names:
                    now_df = pd.read_excel(xls, sheet_name=sheet_name, header=header, usecols=use_cols,
                                           nrows=nrows)
                    now_df['sheet_name'] = sheet_name
                    sheet_frames.append(now_df)
            # Concatenate once at the end rather than growing the frame
            # sheet by sheet.
            if sheet_frames:
                df = pd.concat(sheet_frames)

        # splitext leaves a name without any dot intact; slicing at
        # rfind(".") dropped its last character.
        df['file_name'] = os.path.splitext(base_name)[0]
        log_print('文件读取成功:', file_path, '数据数量:', df.shape, '耗时:', datetime.datetime.now() - begin)
    except Exception as e:
        log_print('读取文件出错', file_path, str(e))
        message = '文件:' + os.path.basename(file_path) + ',' + str(e)
        raise ValueError(message) from e

    return df
|
|
|
+
|
|
|
+
|
|
|
def __build_directory_dict(directory_dict, path, filter_types=None):
    """
    Walk ``path`` recursively and record its files grouped by directory.

    :param directory_dict: dict filled in place; maps a directory path to
                           the list of accepted file paths directly inside it
    :param path: directory to scan
    :param filter_types: accepted file extensions without the dot, compared
                         case-insensitively. None or empty accepts every
                         file.
    """
    accepted_types = {str(file_type).lower() for file_type in filter_types} if filter_types else None

    # Visit every entry of the directory
    for item in os.listdir(path):
        item_path = os.path.join(path, item)
        if os.path.isdir(item_path):
            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
        elif os.path.isfile(item_path):
            # The directory gets an entry as soon as it holds any file,
            # even when none of them passes the filter.
            file_list = directory_dict.setdefault(path, [])

            if accepted_types is None:
                file_list.append(item_path)
                continue

            # Lower-casing lets "DATA.XLSX" match "xlsx"
            extension = str(item_path).split(".")[-1].lower()
            # Paths containing "~$" (e.g. Office lock files) are skipped
            if extension in accepted_types and "~$" not in str(item_path):
                file_list.append(item_path)
|
|
|
+
|
|
|
+
|
|
|
# Collect all Excel-type files under a path
def read_excel_files(read_path, filter_types=None):
    """
    List the spreadsheet-like files found under ``read_path``.

    :param read_path: a file or a directory. A file is returned as a
                      one-element list without applying the filter.
    :param filter_types: accepted extensions; None means
                         ['xls', 'xlsx', 'csv', 'gz']
    :return: flat list of file paths
    """
    wanted_types = ['xls', 'xlsx', 'csv', 'gz'] if filter_types is None else filter_types

    if os.path.isfile(read_path):
        return [read_path]

    files_by_dir = {}
    __build_directory_dict(files_by_dir, read_path, filter_types=wanted_types)

    collected = []
    for file_list in files_by_dir.values():
        for entry in file_list:
            if entry:
                collected.append(entry)
    return collected
|
|
|
+
|
|
|
+
|
|
|
# Collect all files under a path
def read_files(read_path, filter_types=None):
    """
    List the data and archive files found under ``read_path``.

    :param read_path: a file or a directory. A file is returned as a
                      one-element list without applying the filter.
    :param filter_types: accepted extensions; None means
                         ['xls', 'xlsx', 'csv', 'gz', 'zip', 'rar']
    :return: flat list of file paths
    """
    wanted_types = filter_types
    if wanted_types is None:
        wanted_types = ['xls', 'xlsx', 'csv', 'gz', 'zip', 'rar']

    if os.path.isfile(read_path):
        return [read_path]

    files_by_dir = {}
    __build_directory_dict(files_by_dir, read_path, filter_types=wanted_types)

    result = []
    for file_list in files_by_dir.values():
        result.extend(entry for entry in file_list if entry)
    return result
|
|
|
+
|
|
|
+
|
|
|
def copy_to_new(from_path, to_path):
    """
    Copy a file, creating the destination directory first.

    ``to_path`` is treated as a file path when its final component contains
    a dot, and as a directory otherwise. Only the final component is
    checked, so a dot in a parent directory (``data.v2/out``) or a leading
    ``./`` no longer turns a directory target into a file target.

    :param from_path: file to copy
    :param to_path: destination file or directory
    """
    is_file = '.' in os.path.basename(to_path)

    create_file_path(to_path, is_file_path=is_file)

    shutil.copy(from_path, to_path)
|
|
|
+
|
|
|
+
|
|
|
# Create a directory path
def create_file_path(read_path, is_file_path=False):
    """
    Create a directory, including missing parents, if it does not exist.

    :param read_path: directory to create, or a file path whose parent
                      directory should be created
    :param is_file_path: True when ``read_path`` includes a file name
    """
    if is_file_path:
        read_path = os.path.dirname(read_path)

    # A bare file name has an empty dirname: the target is the current
    # directory, and os.makedirs('') would raise FileNotFoundError.
    if not read_path:
        return

    if not os.path.exists(read_path):
        os.makedirs(read_path, exist_ok=True)
|
|
|
+
|
|
|
+
|
|
|
def valid_eval(eval_str):
    """
    Check that an expression string only uses whitelisted names.

    The string is parsed, never executed. It is rejected when it contains
    an ``import`` statement, an attribute whose name starts with a double
    underscore, or a bare name outside the whitelist.

    NOTE(review): a name whitelist reduces the risk of evaluating
    user-supplied expressions but is not a sandbox; do not rely on it for
    fully untrusted input.

    :param eval_str: source text that is about to be evaluated
    :return: True when the string passes every check
    :raises NameError: when a forbidden construct or name is found
    :raises SyntaxError: when the string is not valid Python
    """
    safe_param = {"column", "wind_name", "df", "error_time", "str", "int"}

    used_names = set()
    for node in ast.walk(ast.parse(eval_str)):
        if isinstance(node, (ast.Import, ast.ImportFrom)):
            # Imports bring in names without producing an ast.Name node
            raise NameError(eval_str + " contains unsafe import statement")
        if isinstance(node, ast.Attribute) and node.attr.startswith("__"):
            # Dunder attributes can reach builtins from a whitelisted object
            raise NameError(eval_str + " contains unsafe attribute :" + node.attr)
        if isinstance(node, ast.Name):
            used_names.add(node.id)

    unsafe_names = used_names - safe_param
    if unsafe_names:
        # Sorted so the message is deterministic
        raise NameError(eval_str + " contains unsafe name :" + ','.join(sorted(unsafe_names)))
    return True
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Earlier manual checks of valid_eval, kept for reference:
    # aa = valid_eval("column[column.find('_')+1:]")
    # print(aa)
    #
    # aa = valid_eval("df['123'].apply(lambda wind_name: wind_name.replace('元宝山','').replace('号风机',''))")
    # print(aa)
    #
    # aa = valid_eval("'记录时间' if column == '时间' else column;from os import *; path")
    # print(aa)

    # read_file_to_df has no ``trans_cols`` parameter; the column list is
    # passed as ``use_cols``.
    # NOTE(review): header is left at its default (None); selecting columns
    # by name may need an explicit header row - confirm against the file.
    df = read_file_to_df(r"D:\data\11-12月.xls",
                         use_cols=['风机', '时间', '有功功率', '无功功率', '功率因数', '频率'], nrows=30)

    print(df.columns)
|