|
@@ -0,0 +1,202 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# @Time : 2024/5/16
|
|
|
+# @Author : 魏志亮
|
|
|
+import datetime
|
|
|
+import os
|
|
|
+import shutil
|
|
|
+import warnings
|
|
|
+
|
|
|
+import chardet
|
|
|
+import pandas as pd
|
|
|
+
|
|
|
+from utils.log.trans_log import trans_print
|
|
|
+
|
|
|
+warnings.filterwarnings("ignore")
|
|
|
+
|
|
|
+
|
|
|
+# 获取文件编码
|
|
|
def detect_file_encoding(filename):
    """Guess the text encoding of ``filename`` from its first 1000 bytes.

    Gzip-compressed files (recognised by the gzip magic number) are sniffed
    on their decompressed content, because the compressed bytes say nothing
    about the encoding of the text inside.

    Args:
        filename: Path of the file to inspect.

    Returns:
        ``'utf-8'`` when chardet reports UTF-8 or ASCII, ``'utf-8-sig'`` when
        it reports UTF-8 with a byte-order mark, and ``'gb18030'`` for every
        other result, including the case where chardet cannot decide.
    """
    # Local imports: only needed to peek inside compressed inputs.
    import gzip
    import zlib

    sample_size = 1000  # enough for most encoding detection
    with open(filename, 'rb') as f:
        rawdata = f.read(sample_size)

    if rawdata[:2] == b'\x1f\x8b':  # gzip magic number
        try:
            with gzip.open(filename, 'rb') as f:
                rawdata = f.read(sample_size)
        except (OSError, EOFError, zlib.error):
            # Not a readable gzip stream after all; keep the raw sample.
            pass

    encoding = chardet.detect(rawdata)['encoding']

    trans_print("文件类型:", filename, encoding)

    if encoding is None:
        return 'gb18030'

    normalized = encoding.lower()
    if normalized in ('utf-8-sig', 'utf_8_sig'):
        # A BOM-prefixed UTF-8 file must not be decoded as gb18030.
        return 'utf-8-sig'
    if normalized in ('utf-8', 'ascii', 'utf8'):
        return 'utf-8'

    return 'gb18030'
|
|
|
+
|
|
|
+
|
|
|
def del_blank(df=None, cols=None):
    """Strip leading/trailing whitespace from string cells of the given columns.

    Only columns whose dtype is ``object`` are touched. Non-string cells in
    such a column (numbers, ``None``, NaN) are left exactly as they are; a
    plain ``.str.strip()`` would replace them with NaN.

    Args:
        df: DataFrame to clean; it is modified in place. Defaults to a new
            empty DataFrame.
        cols: Iterable of column labels to clean. Defaults to no columns.

    Returns:
        The same DataFrame object that was passed in (or the new empty one).
    """
    if df is None:
        df = pd.DataFrame()
    if cols is None:
        cols = []

    def _strip_if_text(value):
        return value.strip() if isinstance(value, str) else value

    for col in cols:
        if df[col].dtype == object:
            df[col] = df[col].map(_strip_if_text)
    return df
|
|
|
+
|
|
|
+
|
|
|
+# 切割数组到多个数组
|
|
|
def split_array(array, num):
    """Cut a sliceable sequence into consecutive pieces of at most ``num`` items.

    Args:
        array: Any object supporting ``len()`` and slicing.
        num: Maximum length of each piece; it is used as the ``range`` step,
            so 0 raises ``ValueError``.

    Returns:
        A list of slices of ``array`` in their original order; the last
        piece may be shorter than ``num``.
    """
    pieces = []
    for start in range(0, len(array), num):
        stop = start + num
        pieces.append(array[start:stop])
    return pieces
|
|
|
+
|
|
|
+
|
|
|
def find_read_header(file_path, trans_cols):
    """Locate the header row of a file by searching for expected column names.

    The first 20 rows are loaded with ``read_file_to_df``. If at least two
    names from ``trans_cols`` appear among the parsed column labels, the
    header is row 0. The loaded rows are then scanned, and the first row
    that contains more than two of the names takes over as the header.

    Args:
        file_path: Path of the file to inspect.
        trans_cols: Collection of column names expected in the header.

    Returns:
        A ``(header, read_cols)`` tuple. ``header`` is the 0-based row number
        to pass as ``header=`` to pandas, or ``None`` when nothing matched.
        ``read_cols`` lists the expected names found in the matched data row;
        it is empty when the header was only recognised from the column
        labels, or not at all.
    """
    df = read_file_to_df(file_path, nrows=20)

    header = None
    if sum(1 for col in trans_cols if col in df.columns) >= 2:
        header = 0

    header_values = []
    # Use the positional row number, not the index label: labels are tuples
    # (or arbitrary values) when pandas promotes leading columns to the index.
    for position, (_, row) in enumerate(df.iterrows()):
        values = list(row.values)
        if isinstance(row.name, tuple):
            # Cells that ended up in a MultiIndex still belong to the row.
            values.extend(row.name)

        # Count matches per row so partial hits on earlier rows cannot add up.
        matched = sum(1 for col in trans_cols if col in values)
        if matched > 2:
            header = position + 1  # +1: row 0 of the file became the labels
            header_values = values
            break

    # NOTE(review): when the header is recognised from the column labels
    # (header == 0) this stays empty, so the caller reads every column.
    read_cols = [col for col in header_values if col in trans_cols]

    return header, read_cols
|
|
|
+
|
|
|
+
|
|
|
+# 读取数据到df
|
|
|
def read_file_to_df(file_path, read_cols=None, trans_cols=None, nrows=None):
    """Load a CSV, gzip-compressed CSV or Excel workbook into a DataFrame.

    Paths ending in ``csv`` or ``gz`` (case-insensitive) are read with
    ``pandas.read_csv`` using the encoding from ``detect_file_encoding``.
    Any other path is opened as an Excel workbook with the calamine engine;
    every sheet is read, tagged with a ``sheet_name`` column, and the sheets
    are concatenated.

    Args:
        file_path: Path of the file to read.
        read_cols: Optional column names to restrict the read to. The
            argument itself is never modified.
        trans_cols: Optional expected column names. When given, the header
            row is located with ``find_read_header`` and the names found
            there are added to the columns to read.
        nrows: Optional maximum number of rows to read (per sheet for Excel).

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: If ``trans_cols`` is given and no header row is found.
        ValueError: If reading fails; the message names the file and the
            original error is attached as the cause.
    """
    begin = datetime.datetime.now()
    trans_print('开始读取文件', file_path)
    header = 0

    # Work on a private copy: extending the argument in place would leak
    # columns into the caller's list and into later calls.
    read_cols = list(read_cols) if read_cols is not None else []

    if trans_cols:
        header, find_cols = find_read_header(file_path, trans_cols)
        trans_print(os.path.basename(file_path), "读取第", header, "行")
        if header is None:
            message = '未匹配到开始行,请检查并重新指定'
            trans_print(message)
            raise Exception(message)
        read_cols.extend(find_cols)

    try:
        lower_path = str(file_path).lower()
        end_with_gz = lower_path.endswith("gz")
        if lower_path.endswith("csv") or end_with_gz:
            encoding = detect_file_encoding(file_path)
            csv_kwargs = {'encoding': encoding, 'header': header, 'nrows': nrows}
            if read_cols:
                csv_kwargs['usecols'] = read_cols
            if end_with_gz:
                csv_kwargs['compression'] = 'gzip'
            else:
                # NOTE(review): only plain CSV tolerates malformed lines;
                # gzip input keeps the pandas default, as before.
                csv_kwargs['on_bad_lines'] = 'warn'
            df = pd.read_csv(file_path, **csv_kwargs)
        else:
            excel_kwargs = {'header': header, 'nrows': nrows}
            if read_cols:
                excel_kwargs['usecols'] = read_cols
            frames = []
            # The context manager closes the workbook even if a sheet fails.
            with pd.ExcelFile(file_path, engine="calamine") as xls:
                for sheet_name in xls.sheet_names:
                    now_df = pd.read_excel(xls, sheet_name=sheet_name, **excel_kwargs)
                    now_df['sheet_name'] = sheet_name
                    frames.append(now_df)
            # Concatenate once instead of growing a DataFrame sheet by sheet.
            df = pd.concat(frames) if frames else pd.DataFrame()
        trans_print('文件读取成功:', file_path, '数据数量:', df.shape, '耗时:', datetime.datetime.now() - begin)
    except Exception as e:
        trans_print('读取文件出错', file_path, str(e))
        message = '文件:' + os.path.basename(file_path) + ',' + str(e)
        raise ValueError(message) from e

    return df
|
|
|
+
|
|
|
+
|
|
|
def __build_directory_dict(directory_dict, path, filter_types=None):
    """Recursively collect the files under ``path``, grouped by directory.

    Args:
        directory_dict: Dict filled in place. Each directory that directly
            contains at least one regular file gets a key mapping to the
            list of accepted file paths (possibly empty).
        path: Directory to walk.
        filter_types: Optional collection of extensions without the dot.
            When empty or ``None`` every file is accepted. Otherwise a file
            is accepted when the text after the last ``.`` of its path
            matches one of the entries, compared case-insensitively, and
            its path does not contain ``~$``.
    """
    for item in os.listdir(path):
        item_path = os.path.join(path, item)
        if os.path.isdir(item_path):
            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
        elif os.path.isfile(item_path):
            collected = directory_dict.setdefault(path, [])

            if not filter_types:
                collected.append(item_path)
                continue

            extension = str(item_path).split(".")[-1]
            # Accept ".CSV" as well as ".csv": the reader lower-cases the
            # path before checking its extension, so discovery must agree.
            wanted = extension in filter_types or extension.lower() in filter_types
            if wanted and "~$" not in str(item_path):
                collected.append(item_path)
|
|
|
+
|
|
|
+
|
|
|
+# 读取路径下所有的excel文件
|
|
|
def read_excel_files(read_path):
    """List the xls/xlsx/csv/gz files found at or below ``read_path``.

    Args:
        read_path: A file or directory path.

    Returns:
        ``[read_path]`` when it is an existing file (whatever its type);
        otherwise a flat list of the matching file paths collected by
        ``__build_directory_dict`` from the directory tree.
    """
    if not os.path.isfile(read_path):
        files_by_dir = {}
        __build_directory_dict(files_by_dir, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])

        flattened = []
        for file_list in files_by_dir.values():
            for file_path in file_list:
                if file_path:
                    flattened.append(file_path)
        return flattened

    return [read_path]
|
|
|
+
|
|
|
+
|
|
|
+# 读取路径下所有的文件
|
|
|
def read_files(read_path):
    """List the data and archive files found at or below ``read_path``.

    Accepted extensions are xls, xlsx, csv, gz, zip and rar.

    Args:
        read_path: A file or directory path.

    Returns:
        ``[read_path]`` when it is an existing file, mirroring
        ``read_excel_files``; otherwise a flat list of the matching file
        paths collected by ``__build_directory_dict`` from the directory
        tree.
    """
    # A single file used to fall through to os.listdir() and raise.
    if os.path.isfile(read_path):
        return [read_path]

    directory_dict = {}
    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz', 'zip', 'rar'])

    return [path for paths in directory_dict.values() for path in paths if path]
|
|
|
+
|
|
|
+
|
|
|
def copy_to_new(from_path, to_path):
    """Copy ``from_path`` to ``to_path``, creating missing directories first.

    ``to_path`` is treated as a file path when its final component contains
    a dot, and as a directory otherwise. Only the final component is
    examined, so dots in parent directories or in a leading ``./`` or
    ``../`` no longer turn a directory target into a file target.

    Args:
        from_path: Source file.
        to_path: Destination file path or destination directory.
    """
    is_file = '.' in os.path.basename(to_path)

    create_file_path(to_path, is_file_path=is_file)

    shutil.copy(from_path, to_path)
|
|
|
+
|
|
|
+
|
|
|
+# 创建路径
|
|
|
def create_file_path(path, is_file_path=False):
    """Make sure a directory exists, creating missing parents as needed.

    Args:
        path: Directory to create, or a file path whose parent directory
            should be created.
        is_file_path: When true, ``path`` names a file and only its parent
            directory is created.
    """
    if is_file_path:
        path = os.path.dirname(path)
        if not path:
            # A bare file name has no parent component; os.makedirs('')
            # would raise, and there is nothing to create anyway.
            return

    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Manual smoke check against hard-coded local sample paths: list the
    # files found under a directory, print a separator, then do the same
    # for a single file path.
    sample_dir = r"D:\data\清理数据\招远风电场\WOF053600062-WOB000009_ZYFDC000012\minute"
    sample_file = r"D:\data\清理数据\招远风电场\WOF053600062-WOB000009_ZYFDC000012\minute\WOG00066.csv.gz"

    for found in read_excel_files(sample_dir):
        print(found)

    print("*" * 20)

    for found in read_excel_files(sample_file):
        print(found)
|