Python分析:中国五大城市PM2.5数据分析
```python
import csv
import os
import numpy as np
import config
def load_data(data_file, usecols):
    """Load selected CSV columns into a float array, dropping incomplete rows.

    Args:
        data_file: Path of the CSV file to read.
        usecols: Names of the columns to extract, in output order.

    Returns:
        A 2-D ``numpy.ndarray`` of floats with one row per record that has
        no ``'NA'`` in any requested column, and one column per entry of
        ``usecols``. When no record qualifies, the result has shape
        ``(0, len(usecols))`` so that column slicing still works.

    Raises:
        KeyError: If a requested column is missing from the file header.
        ValueError: If a cell is neither ``'NA'`` nor parseable as a float.
    """
    data = []
    # newline='' lets the csv module do its own line-ending handling.
    with open(data_file, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            # csv yields every cell as a string; 'NA' marks a missing value.
            row_data = [
                float(row[col]) if row[col] != 'NA' else np.nan
                for col in usecols
            ]
            # Keep only records in which every requested column is present.
            if not np.isnan(row_data).any():
                data.append(row_data)

    if not data:
        # np.array([]) would be 1-D; keep the result two-dimensional.
        return np.empty((0, len(usecols)))
    return np.array(data)
def get_polluted_perc(data_arr):
    """Return the fraction of hours falling into each pollution level.

    Each row's hourly value is the mean of columns 2 onward; the first two
    columns are skipped. Levels:
        heavy:   PM2.5 > 150
        medium:  75 < PM2.5 <= 150
        light:   35 < PM2.5 <= 75
        good:    PM2.5 <= 35

    Args:
        data_arr: 2-D array with one row per hour.

    Returns:
        A list ``[heavy, medium, light, good]`` of fractions of the total
        row count. All four are ``0.0`` when ``data_arr`` has no rows.
    """
    if data_arr.shape[0] == 0:
        # No hours at all: avoid dividing by zero below.
        return [0.0, 0.0, 0.0, 0.0]

    # Average across the value columns to get one value per hour.
    hour_val = np.mean(data_arr[:, 2:], axis=1)
    n_hours = hour_val.shape[0]

    level_counts = (
        int(np.count_nonzero(hour_val > 150)),
        int(np.count_nonzero((hour_val > 75) & (hour_val <= 150))),
        int(np.count_nonzero((hour_val > 35) & (hour_val <= 75))),
        int(np.count_nonzero(hour_val <= 35)),
    )
    return [count / n_hours for count in level_counts]
def get_avg_pm_per_month(data_arr):
    """Compute the per-column mean for every (year, month) pair.

    Column 0 is read as the year and column 1 as the month; the remaining
    columns are averaged independently within each month.

    Args:
        data_arr: 2-D numeric array with one row per record.

    Returns:
        An array built from rows of the form ``['YYYY-MM', mean, ...]``,
        ordered by year and then month. Because each row mixes a string
        label with numbers, NumPy stores every element as a string.
    """
    results = []
    for year in np.unique(data_arr[:, 0]):
        # Records belonging to the current year.
        year_data_arr = data_arr[data_arr[:, 0] == year]
        for month in np.unique(year_data_arr[:, 1]):
            # Records belonging to the current month of that year.
            month_data_arr = year_data_arr[year_data_arr[:, 1] == month]
            mean_vals = np.mean(month_data_arr[:, 2:], axis=0).tolist()
            # Label such as '2010-01' followed by one mean per column.
            label = '{:.0f}-{:02.0f}'.format(year, month)
            results.append([label] + mean_vals)
    return np.array(results)
def save_stats_to_csv(results_arr, save_file, headers):
    """Write a header row followed by the result rows to a CSV file.

    Args:
        results_arr: Iterable of rows (for example a 2-D array) to write.
        save_file: Destination path; an existing file is overwritten.
        headers: Column names written as the first row.
    """
    with open(save_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        # Write the actual result rows; looping over an empty list() here
        # would leave the file with nothing but the header.
        writer.writerows(results_arr)
def main():
    """Run the analysis for every configured city and save the results.

    For each entry of ``config.data_config_dict`` this loads the city's
    CSV file, prints a preview, computes the pollution-level fractions and
    the monthly means, and writes the monthly means to
    ``<city>_month_stats.csv``. After all cities are processed, the
    pollution-level fractions are written to ``polluted_percentage.csv``.
    Both outputs go to ``config.output_path``.
    """
    polluted_state_list = []

    for city_name, (filename, cols) in config.data_config_dict.items():
        # === Step 1+2. Load and clean the data ===
        data_file = os.path.join(config.dataset_path, filename)
        # NOTE(review): assumes config.common_cols holds the shared leading
        # columns (the analysis reads columns 0 and 1 as year and month) --
        # confirm against config.py.
        usecols = config.common_cols + ['PM_' + col for col in cols]
        data_arr = load_data(data_file, usecols)

        print('{}共有{}⾏有效数据'.format(city_name, data_arr.shape[0]))
        # Preview the first 10 rows.
        print('{}的前10⾏数据:'.format(city_name))
        print(data_arr[:10])

        # === Step 3. Analysis ===
        # Fraction of hours spent at each pollution level.
        polluted_perc_list = get_polluted_perc(data_arr)
        polluted_state_list.append([city_name] + polluted_perc_list)
        print('{}的污染⼩时数百分⽐{}'.format(city_name, polluted_perc_list))

        # Monthly mean of every value column.
        results_arr = get_avg_pm_per_month(data_arr)
        print('{}的每⽉平均PM值预览:'.format(city_name))
        print(results_arr[:10])

        # === Step 4. Output ===
        # 4.1 Save the monthly statistics of this city.
        save_filename = city_name + '_month_stats.csv'
        save_file = os.path.join(config.output_path, save_filename)
        save_stats_to_csv(results_arr, save_file, headers=['month'] + cols)
        print('⽉度统计结果已保存⾄{}'.format(save_file))
        print()

    # 4.2 Save the pollution-level fractions of all cities in one file.
    save_file = os.path.join(config.output_path, 'polluted_percentage.csv')
    with open(save_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['city', 'heavy', 'medium', 'light', 'good'])
        for row in polluted_state_list:
            writer.writerow(row)
    print('污染状态结果已保存⾄{}'.format(save_file))
# Run the analysis only when the file is executed as a script.
if __name__ == '__main__':
    main()
```
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论