张崾先震动_参数获取.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. import datetime
  2. import multiprocessing
  3. import os.path
  4. import pandas as pd
  5. def __build_directory_dict(directory_dict, path, filter_types=None):
  6. # 遍历目录下的所有项
  7. for item in os.listdir(path):
  8. item_path = os.path.join(path, item)
  9. if os.path.isdir(item_path):
  10. __build_directory_dict(directory_dict, item_path, filter_types=filter_types)
  11. elif os.path.isfile(item_path):
  12. if path not in directory_dict:
  13. directory_dict[path] = []
  14. if filter_types is None or len(filter_types) == 0:
  15. directory_dict[path].append(item_path)
  16. elif str(item_path).split(".")[-1] in filter_types:
  17. if str(item_path).count("~$") == 0:
  18. directory_dict[path].append(item_path)
  19. # 读取路径下所有的excel文件
  20. def read_excel_files(read_path, filter_types=None):
  21. if filter_types is None:
  22. filter_types = ['xls', 'xlsx', 'csv', 'gz']
  23. if os.path.isfile(read_path):
  24. return [read_path]
  25. directory_dict = {}
  26. __build_directory_dict(directory_dict, read_path, filter_types=filter_types)
  27. return [path for paths in directory_dict.values() for path in paths if path]
  28. # 读取路径下所有的文件
  29. def read_files(read_path, filter_types=None):
  30. if filter_types is None:
  31. filter_types = ['xls', 'xlsx', 'csv', 'gz', 'zip', 'rar']
  32. if os.path.isfile(read_path):
  33. return [read_path]
  34. directory_dict = {}
  35. __build_directory_dict(directory_dict, read_path, filter_types=filter_types)
  36. return [path1 for paths in directory_dict.values() for path1 in paths if path1]
  37. all_files = read_files(r'G:\CMS', ['txt'])
  38. def get_line_count(file_path):
  39. with open(file_path, 'r', encoding='utf-8') as file:
  40. return sum(1 for _ in file)
  41. def read_file_and_read_count(index, file_path, datas):
  42. if index % 10000 == 0:
  43. print(datetime.datetime.now(), index)
  44. base_name = os.path.basename(file_path).split('.')[0]
  45. cols = base_name.split('_')
  46. cols.append(get_line_count(file_path))
  47. datas.append(cols)
  48. def get_name(x):
  49. result_str = ''
  50. if x['col3'] != '无':
  51. result_str += x['col3']
  52. result_str += x['col2']
  53. if x['col4'] != '无':
  54. result_str += x['col4']
  55. result_str += x['col7']
  56. return result_str
  57. if __name__ == '__main__':
  58. datas = multiprocessing.Manager().list()
  59. with multiprocessing.Pool(20) as pool:
  60. pool.starmap(read_file_and_read_count, [(i, file_path, datas) for i, file_path in enumerate(all_files)])
  61. df = pd.DataFrame(datas, columns=[f'col{i}' for i in range(10)])
  62. df['col8'] = pd.to_datetime(df['col8'], format='%Y%m%d%H%M%S', errors='coerce')
  63. df.sort_values(by=['col1', 'col8'], inplace=True)
  64. df['测点完整名称'] = df.apply(get_name, axis=1)
  65. df.to_csv('d://cms_data.csv', index=False, encoding='utf8')