Pandas DataFrame 核心操作 —— 数据读写、清洗、聚合
目标
- 掌握 DataFrame 的创建、读取、写入
- 掌握数据筛选、排序、分组聚合
- 理解链式操作与管道风格
完整代码
import pandas as pd
import numpy as np
# ============================================================
# 1. DataFrame 创建
# ============================================================
# 从字典创建
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
"age": [25, 32, 28, 35, 22],
"city": ["Beijing", "Shanghai", "Beijing", "Shenzhen", "Shanghai"],
"salary": [15000, 22000, 18000, 25000, 12000],
"department": ["Engineering", "Sales", "Engineering", "HR", "Sales"],
})
print(f"原始数据:\n{df}\n")
# ============================================================
# 2. 基本属性与信息
# ============================================================
print(f"形状: {df.shape}")
print(f"列名: {df.columns.tolist()}")
print(f"数据类型:\n{df.dtypes}")
print(f"描述统计:\n{df.describe()}")
print(f"\n缺失值:\n{df.isnull().sum()}")
# ============================================================
# 3. 数据筛选
# ============================================================
# 按条件筛选
engineers = df[df["department"] == "Engineering"]
print(f"Engineering 部门:\n{engineers}\n")
# 多条件(用 & 不是 and,用 | 不是 or)
high_paid = df[(df["salary"] > 15000) & (df["city"] == "Beijing")]
print(f"北京高薪员工:\n{high_paid}\n")
# query() 方法(更易读)
result = df.query("salary > 15000 and city == 'Beijing'")
print(f"query 方法:\n{result}\n")
# 按位置:iloc
print(f"前两行:\n{df.iloc[:2]}")
print(f"第1,3行和第0,2列:\n{df.iloc[[0,2], [0,2]]}\n")
# 按标签:loc
print(f"name 和 salary 列:\n{df.loc[:, ["name", "salary"]]}")
# ============================================================
# 4. 数据排序
# ============================================================
sorted_df = df.sort_values("salary", ascending=False)
print(f"\n按工资降序:\n{sorted_df}")
# 多列排序
sorted_df = df.sort_values(["department", "salary"], ascending=[True, False])
print(f"\n按部门+工资排序:\n{sorted_df}")
# ============================================================
# 5. 分组聚合(核心!)
# ============================================================
# 单列聚合
print(f"\n按部门平均工资:\n{df.groupby("department")["salary"].mean()}")
# 多列多聚合
stats = df.groupby("city").agg({
"salary": ["mean", "min", "max", "count"],
"age": "mean",
})
print(f"\n按城市多聚合:\n{stats}")
# 自定义聚合
def salary_range(x):
return x.max() - x.min()
custom_agg = df.groupby("department").agg(
avg_salary=("salary", "mean"),
total_salary=("salary", "sum"),
salary_spread=("salary", salary_range),
headcount=("name", "count"),
)
print(f"\n自定义聚合:\n{custom_agg}")
# transform —— 保持原形状
df["dept_avg"] = df.groupby("department")["salary"].transform("mean")
df["salary_pct"] = df["salary"] / df["dept_avg"] * 100
print(f"\n带部门均值:\n{df}")
# ============================================================
# 6. 链式操作(推荐风格)
# ============================================================
result = (
df
.query("age >= 25")
.groupby("city")
.agg(avg_salary=("salary", "mean"), count=("name", "count"))
.query("count >= 1")
.sort_values("avg_salary", ascending=False)
.reset_index()
)
print(f"\n链式操作结果:\n{result}")
# ============================================================
# 7. 表连接(类似 SQL JOIN)
# ============================================================
departments = pd.DataFrame({
"department": ["Engineering", "Sales", "HR", "Finance"],
"manager": ["张工", "李经理", "王主管", "赵总监"],
"floor": [3, 5, 4, 6],
})
# INNER JOIN
merged = pd.merge(df, departments, on="department", how="inner")
print(f"\nINNER JOIN:\n{merged}")
# LEFT JOIN
merged = pd.merge(df, departments, on="department", how="left")
print(f"\nLEFT JOIN:\n{merged}")
# ============================================================
# 8. 数据透视表
# ============================================================
pivot = df.pivot_table(
values="salary",
index="department",
columns="city",
aggfunc="mean",
fill_value=0,
)
print(f"\n数据透视表:\n{pivot}")
关键要点
| 操作 |
方法 |
| 筛选 |
df[condition], df.query(), df.loc[], df.iloc[] |
| 排序 |
df.sort_values() |
| 分组 |
df.groupby().agg() |
| 窗口 |
df["col"].transform() |
| 合并 |
pd.merge(), pd.concat() |
| 透视 |
df.pivot_table() |
| 链式 |
.pipe() 或在括号中连续调用 |