# impellerDiameter.py
# (listing exported from the file viewer as "impellerDiameterV2.py", 17 KB)
import math
import re
from pathlib import Path

import pandas as pd
  5. def extract_diameter_and_power(model_str):
  6. """
  7. 从风机机型字符串中提取叶轮直径(单位:米)和额定功率(单位:kW)。
  8. 返回一个字典:{'diameter': 直径, 'power_kw': 功率}
  9. 如果无法确定则对应值为None。
  10. """
  11. if not isinstance(model_str, str):
  12. return {'diameter': None, 'power_kw': None}
  13. s = model_str.strip().upper()
  14. # ----- 第一步:找出所有可能的数字 -----
  15. all_numbers = []
  16. matches = re.findall(r'\d+\.?\d*', s)
  17. for num_str in matches:
  18. try:
  19. num = float(num_str)
  20. all_numbers.append(num)
  21. except ValueError:
  22. continue
  23. if len(all_numbers) < 2:
  24. # 如果没有至少两个数字,无法区分直径和功率
  25. return {'diameter': None, 'power_kw': None}
  26. # ----- 第二步:根据特征区分直径和功率 -----
  27. diameter_candidates = []
  28. power_candidates = []
  29. for num in all_numbers:
  30. # 直径的特征:通常为2-3位整数,范围在50-300米之间
  31. if 20 <= num <= 400 and num > 10: # 放宽下限到20,确保包含小直径机型
  32. # 直径通常接近整数,且数值相对较小
  33. if abs(num - round(num)) < 0.1: # 接近整数
  34. diameter_candidates.append(num)
  35. elif 50 <= num <= 300: # 在典型直径范围内的小数也可能是直径
  36. diameter_candidates.append(num)
  37. # 功率的特征:
  38. # 1. 兆瓦级的小数 (如1.5, 2.0, 3.6, 6.7)
  39. # 2. 百位或千位整数 (如1500, 2000, 3000, 5000)
  40. # 3. 万位整数 (如10000, 12000)
  41. # 判断是否为兆瓦级功率(常见的小数功率)
  42. if 0.5 <= num <= 20 and '.' in str(num):
  43. power_candidates.append(num * 1000) # 转换为kW
  44. # 判断是否为千瓦级功率
  45. elif num >= 100: # 功率通常至少100kW以上
  46. # 典型功率值范围
  47. if 100 <= num <= 30000:
  48. power_candidates.append(num)
  49. # ----- 第三步:特殊处理MW单位标识 -----
  50. # 如果字符串中包含"MW"标识,可以更准确地提取功率
  51. if 'MW' in s:
  52. # 寻找靠近"MW"的数字
  53. mw_pattern = r'(\d+\.?\d*)\s*MW'
  54. mw_matches = re.findall(mw_pattern, s)
  55. for mw_str in mw_matches:
  56. try:
  57. mw_value = float(mw_str)
  58. # 转换为kW
  59. kw_value = mw_value * 1000
  60. if kw_value not in power_candidates:
  61. power_candidates.append(kw_value)
  62. except ValueError:
  63. pass
  64. # ----- 第四步:决策逻辑 -----
  65. result = {'diameter': None, 'power_kw': None}
  66. # 1. 直径决策
  67. if diameter_candidates:
  68. # 优先选择在典型直径范围(50-200)内的整数
  69. typical_diameters = [d for d in diameter_candidates if 50 <= d <= 200]
  70. if typical_diameters:
  71. # 选择第一个(通常字符串中先出现的是直径)
  72. result['diameter'] = typical_diameters[0]
  73. else:
  74. # 如果不在典型范围,选择最小的(假设直径通常比功率数值小)
  75. result['diameter'] = min(diameter_candidates)
  76. # 2. 功率决策
  77. if power_candidates:
  78. # 优先选择通过MW标识找到的功率
  79. mw_based_power = [p for p in power_candidates if p in [num * 1000 for num in all_numbers if '.' in str(num)]]
  80. if mw_based_power:
  81. result['power_kw'] = mw_based_power[0]
  82. else:
  83. # 否则选择最大的(假设功率数值通常比直径大)
  84. # 但需要排除明显是直径的值
  85. filtered_power = [p for p in power_candidates if p != result['diameter']]
  86. if filtered_power:
  87. # 对于功率,如果是整数,优先选择常见的功率等级
  88. common_powers = [1500, 2000, 2500, 3000, 5000, 6000, 10000, 12000]
  89. for cp in common_powers:
  90. if cp in [int(p) for p in filtered_power if abs(p - round(p)) < 0.1]:
  91. result['power_kw'] = cp
  92. break
  93. if result['power_kw'] is None:
  94. result['power_kw'] = max(filtered_power)
  95. # ----- 第五步:如果决策失败,尝试基于位置的简单逻辑 -----
  96. if result['diameter'] is None or result['power_kw'] is None:
  97. if len(all_numbers) >= 2:
  98. # 假设第一个数字是直径,第二个是功率(常见格式:直径-功率)
  99. if 20 <= all_numbers[0] <= 300:
  100. result['diameter'] = all_numbers[0]
  101. # 判断第二个数字是否为功率
  102. if len(all_numbers) > 1:
  103. second_num = all_numbers[1]
  104. # 如果是小数,很可能是兆瓦级功率
  105. if '.' in str(second_num) and 0.5 <= second_num <= 20:
  106. result['power_kw'] = second_num * 1000
  107. elif second_num >= 100 and second_num <= 30000:
  108. result['power_kw'] = second_num
  109. return result
  110. def calculate_swept_area(diameter):
  111. """
  112. 计算扫风面积
  113. 公式:扫风面积 = π × (叶轮直径/2)²
  114. 单位:平方米(㎡)
  115. """
  116. if diameter is None or pd.isna(diameter):
  117. return None
  118. try:
  119. # 使用高精度的π值
  120. radius = diameter / 2.0
  121. swept_area = math.pi * (radius ** 2)
  122. return round(swept_area, 2) # 保留两位小数
  123. except (TypeError, ValueError):
  124. return None
  125. def calculate_rated_wind_speed(group):
  126. """
  127. 计算额定风速
  128. 定义:有功功率 >= 额定功率 的最小风速
  129. """
  130. if group.empty:
  131. return None
  132. # 获取该分组的额定功率(假设同一分组内额定功率相同)
  133. rated_power = group['额定功率(kW)'].iloc[0]
  134. # 如果额定功率为空,无法计算
  135. if pd.isna(rated_power):
  136. return None
  137. # 找到有功功率 >= 额定功率的数据行
  138. qualified_data = group[group['有功功率'] >= rated_power]
  139. # 如果没有满足条件的行,尝试寻找最接近额定功率的数据
  140. if qualified_data.empty:
  141. # 找到有功功率最接近额定功率的行(向上取)
  142. if group['有功功率'].max() > 0:
  143. # 计算与额定功率的绝对差值
  144. group['power_diff'] = abs(group['有功功率'] - rated_power)
  145. # 找到差值最小的行
  146. closest_row = group.loc[group['power_diff'].idxmin()]
  147. return closest_row['风速']
  148. return None
  149. # 找到最小风速
  150. rated_wind_speed = qualified_data['风速'].min()
  151. return rated_wind_speed
  152. def calculate_rated_wind_speed_for_groups(df):
  153. """
  154. 按标准机型和描述分组计算额定风速
  155. """
  156. if '标准机型' not in df.columns or '描述' not in df.columns:
  157. print("错误:数据框中缺少'标准机型'或'描述'列")
  158. return df
  159. print("正在按'标准机型'和'描述'分组计算额定风速...")
  160. # 按标准机型和描述分组
  161. groups = df.groupby(['标准机型', '描述'])
  162. # 创建一个字典来存储每个分组的额定风速
  163. rated_wind_speed_dict = {}
  164. # 计算每个分组的额定风速
  165. for (turbine_model, description), group in groups:
  166. rated_wind_speed = calculate_rated_wind_speed(group)
  167. rated_wind_speed_dict[(turbine_model, description)] = rated_wind_speed
  168. # 将额定风速添加到DataFrame中
  169. rated_wind_speeds = []
  170. for idx, row in df.iterrows():
  171. key = (row['标准机型'], row['描述'])
  172. rated_wind_speed = rated_wind_speed_dict.get(key, None)
  173. rated_wind_speeds.append(rated_wind_speed)
  174. df['额定风速(m/s)'] = rated_wind_speeds
  175. # 统计计算成功率
  176. total_groups = len(groups)
  177. successful_groups = sum(1 for v in rated_wind_speed_dict.values() if v is not None)
  178. print(f"额定风速计算完成。")
  179. print(f"分组数量:{total_groups}")
  180. print(f"成功计算额定风速的分组数:{successful_groups} ({successful_groups/total_groups*100:.1f}%)")
  181. return df
  182. def main():
  183. # ---------- 配置区:请根据您的实际文件修改 ----------
  184. input_file = f"./data/全部机型功率曲线_含标准类型.csv" # 输入文件名,支持 .csv, .xlsx, .xls
  185. output_file = f"./output/全部机型功率曲线_含标准类型_解析结果.csv" # 输出文件名
  186. model_column_name = "标准机型" # 包含机型信息的列名
  187. # -------------------------------------------------
  188. # 读取文件
  189. if input_file.endswith('.csv'):
  190. df = pd.read_csv(input_file, encoding='utf-8') # 如果编码不对,可尝试 'gbk'
  191. elif input_file.endswith(('.xlsx', '.xls')):
  192. df = pd.read_excel(input_file)
  193. else:
  194. print("错误:不支持的文件格式。请使用 .csv, .xlsx 或 .xls 文件。")
  195. return
  196. # 检查"机型"列是否存在
  197. if model_column_name not in df.columns:
  198. print(f"错误:数据框中找不到名为 '{model_column_name}' 的列。")
  199. print(f"可用的列有:{list(df.columns)}")
  200. return
  201. # 应用提取函数
  202. print("正在解析叶轮直径和额定功率...")
  203. # 创建临时列表存储结果
  204. diameters = []
  205. powers = []
  206. for model in df[model_column_name]:
  207. result = extract_diameter_and_power(model)
  208. diameters.append(result['diameter'])
  209. powers.append(result['power_kw'])
  210. # 添加到DataFrame
  211. df["叶轮直径(m)"] = diameters
  212. df["额定功率(kW)"] = powers
  213. # 计算扫风面积
  214. print("正在计算扫风面积...")
  215. df["扫风面积(㎡)"] = df["叶轮直径(m)"].apply(calculate_swept_area)
  216. # 统计提取成功率
  217. dia_success = df["叶轮直径(m)"].notna().sum()
  218. power_success = df["额定功率(kW)"].notna().sum()
  219. swept_area_success = df["扫风面积(㎡)"].notna().sum()
  220. total_count = len(df)
  221. print(f"解析完成。")
  222. print(f"叶轮直径:成功提取 {dia_success}/{total_count} 条记录 ({dia_success/total_count*100:.1f}%)")
  223. print(f"额定功率:成功提取 {power_success}/{total_count} 条记录 ({power_success/total_count*100:.1f}%)")
  224. print(f"扫风面积:成功计算 {swept_area_success}/{total_count} 条记录 ({swept_area_success/total_count*100:.1f}%)")
  225. # 显示一些功率单位的转换情况
  226. if not df["额定功率(kW)"].empty:
  227. mw_count = (df["额定功率(kW)"] % 1000 == 0).sum()
  228. print(f"其中 {mw_count} 条记录的功率由MW单位转换得到")
  229. # 按标准机型和描述分组计算额定风速
  230. df = calculate_rated_wind_speed_for_groups(df)
  231. # 保存到新文件
  232. if output_file.endswith('.csv'):
  233. df.to_csv(output_file, index=False, encoding='utf-8-sig')
  234. else:
  235. if not output_file.endswith(('.xlsx', '.xls')):
  236. output_file = output_file + '.xlsx'
  237. df.to_excel(output_file, index=False)
  238. print(f"结果已保存至:{output_file}")
  239. # 预览结果
  240. print("\n前10条记录预览:")
  241. preview_cols = [model_column_name, "叶轮直径(m)", "额定功率(kW)", "扫风面积(㎡)", "额定风速(m/s)"]
  242. preview_cols = [col for col in preview_cols if col in df.columns]
  243. print(df[preview_cols].head(10))
  244. # 显示一些统计信息
  245. print("\n解析结果统计:")
  246. if dia_success > 0:
  247. print(f"叶轮直径范围:{df['叶轮直径(m)'].min():.1f} - {df['叶轮直径(m)'].max():.1f} 米")
  248. if power_success > 0:
  249. print(f"额定功率范围:{df['额定功率(kW)'].min():.0f} - {df['额定功率(kW)'].max():.0f} kW")
  250. if swept_area_success > 0:
  251. print(f"扫风面积范围:{df['扫风面积(㎡)'].min():.0f} - {df['扫风面积(㎡)'].max():.0f} ㎡")
  252. # 显示额定风速统计
  253. if '额定风速(m/s)' in df.columns:
  254. rated_wind_success = df['额定风速(m/s)'].notna().sum()
  255. if rated_wind_success > 0:
  256. print(f"额定风速范围:{df['额定风速(m/s)'].min():.1f} - {df['额定风速(m/s)'].max():.1f} m/s")
  257. print(f"额定风速平均值:{df['额定风速(m/s)'].mean():.1f} m/s")
  258. print(f"额定风速中位数:{df['额定风速(m/s)'].median():.1f} m/s")
  259. # 显示最常见的功率等级
  260. if not df["额定功率(kW)"].empty:
  261. common_powers = df["额定功率(kW)"].dropna().astype(int).value_counts().head(5)
  262. print("\n最常见的5个额定功率等级(kW):")
  263. for power, count in common_powers.items():
  264. print(f" {power} kW: {count} 台")
  265. # 显示扫风面积与功率的关系示例
  266. print("\n扫风面积与功率关系示例(前5条有效记录):")
  267. valid_records = df[["叶轮直径(m)", "扫风面积(㎡)", "额定功率(kW)", "额定风速(m/s)"]].dropna().head(5)
  268. for idx, row in valid_records.iterrows():
  269. print(f" 直径{row['叶轮直径(m)']:.1f}m → 面积{row['扫风面积(㎡)']:.0f}㎡ → 功率{row['额定功率(kW)']:.0f}kW → 额定风速{row['额定风速(m/s)']:.1f}m/s")
  270. def calculate_additional_stats(df):
  271. """
  272. 计算额外的统计指标(可选功能)
  273. """
  274. if "叶轮直径(m)" in df.columns and "扫风面积(㎡)" in df.columns:
  275. # 计算单位扫风面积的功率密度
  276. df_valid = df.dropna(subset=["叶轮直径(m)", "扫风面积(㎡)", "额定功率(kW)"])
  277. if len(df_valid) > 0:
  278. print("\n功率密度分析(W/㎡):")
  279. df_valid["功率密度(W/㎡)"] = (df_valid["额定功率(kW)"] * 1000) / df_valid["扫风面积(㎡)"]
  280. print(f"平均功率密度:{df_valid['功率密度(W/㎡)'].mean():.2f} W/㎡")
  281. print(f"功率密度范围:{df_valid['功率密度(W/㎡)'].min():.2f} - {df_valid['功率密度(W/㎡)'].max():.2f} W/㎡")
  282. # 添加功率密度列到原始DataFrame
  283. df["功率密度(W/㎡)"] = (df["额定功率(kW)"] * 1000) / df["扫风面积(㎡)"]
  284. # 计算额定风速与功率的关系
  285. if '额定风速(m/s)' in df.columns and '额定功率(kW)' in df.columns:
  286. df_valid_wind = df.dropna(subset=["额定风速(m/s)", "额定功率(kW)"])
  287. if len(df_valid_wind) > 0:
  288. print("\n额定风速与功率关系分析:")
  289. # 按额定功率分组计算平均额定风速
  290. power_groups = df_valid_wind.groupby(pd.cut(df_valid_wind['额定功率(kW)'],
  291. bins=[0, 1000, 2000, 3000, 5000, 10000, 30000]))
  292. print("按功率区间分组的平均额定风速:")
  293. for power_range, group in power_groups:
  294. if len(group) > 0:
  295. avg_wind_speed = group['额定风速(m/s)'].mean()
  296. print(f" {power_range}: {avg_wind_speed:.1f} m/s (样本数: {len(group)})")
  297. def analyze_rated_wind_speed_by_model(df):
  298. """
  299. 按机型分析额定风速
  300. """
  301. if '标准机型' not in df.columns or '额定风速(m/s)' not in df.columns:
  302. return
  303. # 过滤出有额定风速的数据
  304. df_valid = df.dropna(subset=['标准机型', '额定风速(m/s)'])
  305. if len(df_valid) == 0:
  306. print("\n没有可用的额定风速数据进行机型分析")
  307. return
  308. print("\n按机型分析额定风速:")
  309. # 按机型分组计算统计信息
  310. model_stats = df_valid.groupby('标准机型')['额定风速(m/s)'].agg([
  311. ('平均额定风速', 'mean'),
  312. ('最小额定风速', 'min'),
  313. ('最大额定风速', 'max'),
  314. ('标准差', 'std'),
  315. ('样本数', 'count')
  316. ]).round(2)
  317. # 按样本数排序
  318. model_stats = model_stats.sort_values('样本数', ascending=False)
  319. print("各机型额定风速统计(按样本数排序):")
  320. for model, stats in model_stats.head(10).iterrows(): # 显示前10个机型
  321. print(f" 机型: {model}")
  322. print(f" 样本数: {stats['样本数']}, 平均额定风速: {stats['平均额定风速']} m/s")
  323. print(f" 范围: {stats['最小额定风速']} - {stats['最大额定风速']} m/s, 标准差: {stats['标准差']} m/s")
  324. print()
  325. if __name__ == "__main__":
  326. main()
  327. # 可选:运行额外统计功能
  328. print("\n" + "="*60)
  329. print("运行额外统计功能...")
  330. print("="*60)
  331. # 重新加载数据以进行额外分析
  332. try:
  333. df_result = pd.read_csv(f"./output/全部机型功率曲线_含标准类型_解析结果.csv", encoding='utf-8-sig')
  334. # 计算额外统计
  335. calculate_additional_stats(df_result)
  336. # 按机型分析额定风速
  337. analyze_rated_wind_speed_by_model(df_result)
  338. # 可选:保存带有额外统计的结果
  339. extra_output_file = f"./output/全部机型功率曲线_含标准类型_解析结果_详细.csv"
  340. df_result.to_csv(extra_output_file, index=False, encoding='utf-8-sig')
  341. print(f"\n详细统计结果已保存至:{extra_output_file}")
  342. except Exception as e:
  343. print(f"运行额外统计功能时出错: {e}")