# Helpers for loading CSV and Excel files into pandas DataFrames.
- import os
- import chardet
- import pandas as pd
def detect_file_encoding(filename):
    """Return the codec to use when reading *filename*.

    The decision is two-way: ``'utf-8'`` or ``'gb18030'``. Instead of
    guessing from a short prefix, the whole file is validated as strict
    UTF-8. UTF-8 is self-validating, so text in a legacy Chinese encoding
    practically never passes the check, and a file whose first non-ASCII
    byte appears late is still classified correctly.

    Args:
        filename: Path of the file to inspect.

    Returns:
        ``'utf-8'`` when every byte of the file decodes as UTF-8 (this
        covers plain ASCII, BOM-prefixed UTF-8 and an empty file),
        otherwise ``'gb18030'``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_chars = 1 << 16
    try:
        # Text mode decodes incrementally, so a multi-byte character that
        # straddles a chunk boundary is still validated correctly, and the
        # scan stops at the first invalid byte.
        with open(filename, encoding='utf-8', errors='strict', newline='') as stream:
            while stream.read(chunk_chars):
                pass
    except UnicodeDecodeError:
        return 'gb18030'
    return 'utf-8'
def read_file_to_df(file_path, nrows=None):
    """Load a CSV file, or every sheet of an Excel workbook, into one DataFrame.

    A path ending in ``csv`` (case-insensitive) is read with
    ``pandas.read_csv`` using the encoding chosen by
    ``detect_file_encoding`` and ``on_bad_lines='warn'``. Any other path is
    opened as an Excel workbook: each sheet is read, tagged with a
    ``sheet_name`` column, and the sheets are concatenated in workbook
    order, each keeping its own row index.

    Args:
        file_path: Path of the CSV or Excel file to read.
        nrows: Maximum number of rows to read from the CSV file, or from
            each sheet of a workbook. ``None`` reads everything.

    Returns:
        The loaded data. A workbook without sheets yields an empty
        DataFrame.

    Raises:
        ValueError: On any failure while reading. The message is prefixed
            with the file's base name and the original exception is
            attached as ``__cause__``.
    """
    try:
        if str(file_path).lower().endswith("csv"):
            encoding = detect_file_encoding(file_path)
            return pd.read_csv(
                file_path, encoding=encoding, on_bad_lines='warn', nrows=nrows
            )

        # The context manager closes the workbook even if a sheet fails to parse.
        with pd.ExcelFile(file_path) as workbook:
            sheets = []
            for sheet_name in workbook.sheet_names:
                sheet_df = pd.read_excel(workbook, sheet_name=sheet_name, nrows=nrows)
                sheet_df['sheet_name'] = sheet_name
                sheets.append(sheet_df)

        # One concat over all sheets avoids repeated copying and avoids
        # concatenating with an empty placeholder frame.
        return pd.concat(sheets) if sheets else pd.DataFrame()
    except Exception as e:
        # Boundary handler: every failure is reported with the file name so
        # the caller can tell which input was at fault.
        message = '文件:' + os.path.basename(file_path) + ',' + str(e)
        raise ValueError(message) from e
if __name__ == '__main__':
    # Manual smoke test: load a sample workbook and dump it to stdout.
    sample_path = r"D:\data\11-12月.xls"
    loaded = read_file_to_df(sample_path)
    print(loaded)