curge_read.py 1.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. import os
  2. import chardet
  3. import pandas as pd
  4. # 获取文件编码
  5. def detect_file_encoding(filename):
  6. # 读取文件的前1000个字节(足够用于大多数编码检测)
  7. with open(filename, 'rb') as f:
  8. rawdata = f.read(1000)
  9. result = chardet.detect(rawdata)
  10. encoding = result['encoding']
  11. if encoding is None:
  12. encoding = 'gb18030'
  13. if encoding.lower() in ['utf-8', 'ascii', 'utf8', 'utf-8-sig']:
  14. return 'utf-8'
  15. return 'gb18030'
  16. def read_file_to_df(file_path, nrows=None):
  17. df = pd.DataFrame()
  18. try:
  19. if str(file_path).lower().endswith("csv"):
  20. encoding = detect_file_encoding(file_path)
  21. df = pd.read_csv(file_path, encoding=encoding, on_bad_lines='warn', nrows=nrows)
  22. else:
  23. xls = pd.ExcelFile(file_path)
  24. sheet_names = xls.sheet_names
  25. for sheet_name in sheet_names:
  26. now_df = pd.read_excel(xls, sheet_name=sheet_name, nrows=nrows)
  27. now_df['sheet_name'] = sheet_name
  28. df = pd.concat([df, now_df])
  29. xls.close()
  30. except Exception as e:
  31. message = '文件:' + os.path.basename(file_path) + ',' + str(e)
  32. raise ValueError(message)
  33. return df
  34. if __name__ == '__main__':
  35. df = read_file_to_df(r"D:\data\11-12月.xls")
  36. print(df)