"""Regroup per-measurement-point wind-farm exports into one CSV per turbine.

Every input file holds a time column plus columns whose names contain
``<turbine>#``.  The script reads all input files in parallel, collects the
columns that belong to each turbine, outer-merges them on the time column
and writes ``<turbine>#.csv`` into ``save_dir``.
"""
import codecs
import copy  # noqa: F401  (no longer used here; kept so the file's import set does not shrink)
import gzip
import multiprocessing
# NOTE: this used to be ``from os import *``.  The star import replaced the
# builtin ``open`` with ``os.open`` and exposed ``path`` as a bare name that
# function parameters then shadowed, so the helpers below crashed.
import os
import warnings

import pandas as pd

try:
    import chardet
except ImportError:  # optional: a strict UTF-8 trial decode is used instead
    chardet = None

warnings.filterwarnings("ignore")

# Alternative (Linux test) locations:
# read_path = r'/home/wzl/test_data/红阳'
# save_dir = r'/home/wzl/test_data/整理'
read_path = r'D:\data\红阳\红阳秒级分测点\红阳'
save_dir = r'D:\data\红阳\红阳秒级分测点\整理'

# Column names used by the merge step.
TIME_COLUMN = '统计时间'
TURBINE_COLUMN = '风机编号'
# Prefix stripped from a column name to obtain the turbine id.
FARM_PREFIX = '红阳风电场_'

# Detector results that can be read with the utf-8 codec.
_UTF8_COMPATIBLE = ('utf-8', 'ascii', 'utf8')


def __build_directory_dict(directory_dict, path, filter_types=None):
    """Recursively record the files below ``path`` into ``directory_dict``.

    Args:
        directory_dict: Mapping ``directory -> list of file paths``; updated
            in place.
        path: Directory to scan.
        filter_types: Extensions (without the dot) to keep.  ``None`` or an
            empty sequence keeps every file.  When a filter is given, paths
            containing ``~$`` (Office lock files) are dropped as well.
    """
    # Sorted so that the file order, and therefore the merged column order,
    # is the same on every run.
    for item in sorted(os.listdir(path)):
        item_path = os.path.join(path, item)
        if os.path.isdir(item_path):
            __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
        elif os.path.isfile(item_path):
            files = directory_dict.setdefault(path, [])
            if not filter_types:
                files.append(item_path)
            elif item_path.split(".")[-1] in filter_types and "~$" not in item_path:
                files.append(item_path)


def read_excel_files(read_path):
    """Return every xls/xlsx/csv/gz file found at or below ``read_path``.

    A path that is itself a file is returned unchanged as a one-element list.
    """
    if os.path.isfile(read_path):
        return [read_path]

    directory_dict = {}
    __build_directory_dict(directory_dict, read_path, filter_types=['xls', 'xlsx', 'csv', 'gz'])

    return [file for files in directory_dict.values() for file in files if file]


def _sniff_encoding(sample):
    """Return the detector's encoding name for ``sample``, or ``None``."""
    if not sample:
        return None
    if chardet is not None:
        return chardet.detect(sample)['encoding']
    try:
        # final=False tolerates a multi-byte character cut off at the end of
        # the sample.
        codecs.getincrementaldecoder('utf-8')().decode(sample, final=False)
    except UnicodeDecodeError:
        return None
    return 'utf-8'


def detect_file_encoding(filename):
    """Guess the text encoding of ``filename``.

    Returns ``'utf-8-sig'`` when the data starts with a UTF-8 byte order
    mark, ``'utf-8'`` when the detector reports UTF-8 or ASCII, and
    ``'gb18030'`` for everything else, including an undetectable sample.
    """
    # A ``.gz`` file has to be sniffed on its decompressed content; the
    # compressed bytes say nothing about the text encoding.
    opener = gzip.open if str(filename).lower().endswith('.gz') else open
    # The first 1000 bytes are enough for most encoding detection.
    with opener(filename, 'rb') as f:
        rawdata = f.read(1000)

    if rawdata.startswith(codecs.BOM_UTF8):
        return 'utf-8-sig'

    encoding = _sniff_encoding(rawdata)
    if encoding is not None and encoding.lower() in _UTF8_COMPATIBLE:
        return 'utf-8'

    return 'gb18030'


def read_and_organize(file):
    """Load ``file`` into a DataFrame and return ``(file, DataFrame)``.

    Excel workbooks go through ``pd.read_excel``; everything else is parsed
    as CSV with the detected encoding (pandas infers gzip compression from
    the file extension).
    """
    if str(file).lower().endswith(('.xls', '.xlsx')):
        df = pd.read_excel(file)
    else:
        df = pd.read_csv(file, encoding=detect_file_encoding(file))
    return file, df


def group_columns_by_turbine(columns):
    """Map each turbine id to the columns that belong to it.

    The id is the text before the first ``#`` with ``FARM_PREFIX`` removed.
    Columns without ``#`` are ignored.  Grouping on the exact id keeps
    turbine ``1`` from also collecting the columns of ``10``, ``11``, ...,
    which substring matching on ``'_1'`` would do.
    """
    grouped = {}
    for column in columns:
        text = str(column)
        if '#' not in text:
            continue
        turbine = text.split('#')[0].replace(FARM_PREFIX, '')
        grouped.setdefault(turbine, []).append(column)
    return grouped


def merge_turbine_frames(datas):
    """Combine ``(file, DataFrame)`` pairs into one DataFrame per turbine.

    Each turbine's columns are renamed to the part after the last ``_``,
    tagged with the turbine id and outer-merged on time and turbine id with
    what was already collected for that turbine.

    Returns:
        dict mapping turbine id to its merged DataFrame.

    Raises:
        KeyError: if a file with turbine columns lacks the time column.
    """
    wind_name_df = dict()
    for file, df in datas:
        print(file, df.columns)
        for wind_name, cols in group_columns_by_turbine(df.columns).items():
            # Explicit copy: the frame is modified below and must not be a
            # view of the source DataFrame.
            query_df = df.loc[:, [TIME_COLUMN] + cols].copy()
            query_df.columns = [str(i).split('_')[-1] for i in query_df.columns]
            query_df[TURBINE_COLUMN] = wind_name

            if wind_name in wind_name_df:
                wind_name_df[wind_name] = pd.merge(
                    wind_name_df[wind_name], query_df,
                    on=[TIME_COLUMN, TURBINE_COLUMN], how='outer')
            else:
                wind_name_df[wind_name] = query_df
    return wind_name_df


def main():
    """Read every input file, regroup by turbine and write the CSV files."""
    # Scanned here rather than at import time: with the ``spawn`` start
    # method every worker re-imports this module and would repeat the walk.
    all_files = read_excel_files(read_path)
    if not all_files:
        print('no input files found under', read_path)
        return

    with multiprocessing.Pool(min(10, len(all_files))) as pool:
        datas = pool.map(read_and_organize, all_files)

    wind_name_df = merge_turbine_frames(datas)

    os.makedirs(save_dir, exist_ok=True)
    for wind_name, df in wind_name_df.items():
        df.to_csv(os.path.join(save_dir, wind_name + '#.csv'), index=False, encoding='utf8')


if __name__ == '__main__':
    main()