NumPy 数组基础 —— 创建、索引、运算
目标
- 掌握 ndarray 的创建、属性、索引与切片
- 理解向量化运算与 Python 循环的性能差异
- 掌握广播机制
完整代码
import numpy as np
# ============================================================
# 1. Array creation
# ============================================================
# Arrays built directly from (nested) Python lists.
a = np.array([1, 2, 3, 4, 5])
b = np.array([[1, 2, 3], [4, 5, 6]])
print(f"1D 数组: {a}")
print(f"2D 数组:\n{b}")
# Constant-filled and identity arrays.
zeros = np.zeros(shape=(3, 4))  # all zeros
ones = np.ones(shape=(2, 3))  # all ones
eye = np.identity(3)  # 3x3 identity matrix
full = np.full(shape=(2, 2), fill_value=7)  # every element is 7
print(f"单位矩阵:\n{eye}")
# Evenly spaced sequences.
arange = np.arange(0, 10, 2)  # step of 2; the stop value is excluded
linspace = np.linspace(0, 1, num=5)  # 5 evenly spaced points, both endpoints included
print(f"arange: {arange}")
print(f"linspace: {linspace}")
# Random arrays drawn from NumPy's global generator (not seeded here).
rand = np.random.random_sample(size=(3, 3))  # uniform on [0, 1)
randn = np.random.standard_normal(size=(3, 3))  # standard normal
randint = np.random.randint(low=1, high=100, size=(3, 3))  # integers in [1, 100)
# A seeded Generator produces reproducible draws.
rng = np.random.default_rng(42)
print(f"可复现随机数: {rng.random(5)}")
# ============================================================
# 2. Array attributes
# ============================================================
# A 4-D array of standard-normal samples to inspect.
arr = np.random.standard_normal(size=(4, 3, 28, 28))
print(f"\n形状: {arr.shape}") # (4, 3, 28, 28)
print(f"维度数: {arr.ndim}") # 4
print(f"元素总数: {arr.size}") # 4*3*28*28 = 9408
print(f"数据类型: {arr.dtype}") # float64
print(f"元素大小: {arr.itemsize} bytes")
print(f"总内存: {arr.nbytes} bytes ({arr.nbytes / 1024 / 1024:.2f} MB)")
# Casting to single precision halves the per-element size (8 -> 4 bytes).
arr_f32 = arr.astype("float32")
print(f"float32 内存减半: {arr_f32.nbytes} bytes")
# ============================================================
# 3. Indexing and slicing
# ============================================================
# The integers 1..12 laid out as a 3x4 matrix.
arr = np.reshape(np.arange(1, 13), (3, 4))
print(f"\n原始数组:\n{arr}")
# Basic slicing: a whole row, a whole column, and a 2x2 sub-matrix.
print(f"第2行: {arr[1]}")
print(f"第1列: {arr[:, 0]}")
print(f"子矩阵:\n{arr[0:2, 1:3]}")
# Boolean indexing: the mask selects the elements greater than 6.
mask = np.greater(arr, 6)
print(f"大于6的元素: {arr[mask]}")
# Fancy indexing in two steps: rows 0 and 2 first, then columns 1 and 3.
print(f"取(0,2)行和(1,3)列:\n{arr[[0, 2], :][:, [1, 3]]}")
# np.ix_ selects the same rows x columns cross product in one indexing step.
print(f"np.ix_ 笛卡尔积:\n{arr[np.ix_([0, 2], [1, 3])]}")
# ============================================================
# 4. Reshaping
# ============================================================
# Twelve consecutive integers viewed under several shapes.
a = np.arange(12)
print(f"\nreshape 3×4:\n{a.reshape(3, 4)}")
print(f"reshape 2×2×3:\n{a.reshape(2, 2, 3)}")
# A -1 entry asks NumPy to infer that dimension from the total size.
print(f"自动推断行数:\n{a.reshape(-1, 4)}")
# Transpose of a 2x3 matrix.
m = np.reshape(np.arange(6), (2, 3))
print(f"转置:\n{m.T}")
# Inserting and dropping length-1 axes.
x = np.asarray([1, 2, 3])
print(f"np.newaxis 增加列维度: {x[:, np.newaxis].shape}") # (3, 1)
print(f"expand_dims: {np.expand_dims(x, axis=0).shape}") # (1, 3)
print(f"squeeze: {np.ones((1,3,1,5)).squeeze().shape}") # (3, 5)
# Stacking: b is appended as a new row, and b.T as a new column.
a = np.arange(1, 5).reshape(2, 2)
b = np.asarray([[5, 6]])
print(f"\n垂直拼接:\n{np.vstack([a, b])}")
print(f"水平拼接:\n{np.hstack([a, b.T])}")
# ============================================================
# 5. Vectorized operations (performance comparison)
# ============================================================
import time
N = 10_000_000
arr = np.random.randn(N)
# Vectorized: a single expression applied to the whole array.
# time.perf_counter() is used for the measurements because it is a
# monotonic, high-resolution clock intended for timing short intervals;
# time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter()
result_np = arr * 2 + 1
elapsed_np = time.perf_counter() - start
print(f"\nNumPy 向量化: {elapsed_np:.4f} 秒")
# Element-by-element Python loop over the same array (don't do this!).
start = time.perf_counter()
result_py = [x * 2 + 1 for x in arr]
elapsed_py = time.perf_counter() - start
print(f"Python 列表推导: {elapsed_py:.4f} 秒")
# Element-wise arithmetic and math functions on a small integer vector.
a = np.arange(1, 5)
print(f"\n加法: {a + 2}")
print(f"乘法: {a * 3}")
print(f"平方: {a ** 2}")
print(f"指数: {np.exp(a)}")
print(f"对数: {np.log(a)}")
# ============================================================
# 6. Broadcasting
# ============================================================
# Scalar broadcasting: the scalar is combined with every element.
arr = np.arange(6).reshape(2, 3)
print(f"\n标量广播:\n{arr + 10}")
# Vector broadcasting.
row = np.array([10, 20, 30])  # shape (3,): added to each row
print(f"向量广播(按行):\n{arr + row}")
col = np.array([[10], [20]])  # shape (2, 1): added to each column
print(f"向量广播(按列):\n{arr + col}")
# Broadcasting rule: shapes are aligned starting from the LAST dimension;
# a shape with fewer dimensions is padded with 1s on the left, and every
# aligned pair of sizes must either be equal or contain a 1.
# (2, 3) + (1, 3) → (2, 3) ✅
# (2, 3) + (2, 1) → (2, 3) ✅
# (2, 3) + (3,)   → (2, 3) ✅ ((3,) is padded to (1, 3))
# (2, 3) + (2,)   → ValueError ❌ ((2,) is padded to (1, 2); 2 != 3)
mismatched = np.array([10, 20])  # shape (2,)
try:
    arr + mismatched
except ValueError as err:
    # Padding happens on the left only, so the trailing sizes 3 and 2 clash.
    print(f"形状不兼容,无法广播: {err}")
# ============================================================
# 7. Aggregation
# ============================================================
# A 4x5 matrix of standard-normal samples to summarise.
arr = np.random.standard_normal(size=(4, 5))
print(f"\n全局求和: {arr.sum():.4f}")
print(f"全局均值: {arr.mean():.4f}")
print(f"全局最大值: {arr.max():.4f}")
print(f"全局最小值: {arr.min():.4f}")
print(f"标准差: {arr.std():.4f}")
# Reductions along one axis: axis=0 collapses the rows (one result per
# column), axis=1 collapses the columns (one result per row).
print(f"\n按列求和 (axis=0): {arr.sum(axis=0)}") # one sum per column
print(f"按行求和 (axis=1): {arr.sum(axis=1)}") # one sum per row
print(f"每列均值: {arr.mean(axis=0)}")
print(f"每行最大值: {arr.max(axis=1)}")
# argmax returns the position of the maximum rather than its value.
print(f"每列最大值索引: {arr.argmax(axis=0)}")
运行输出示例
NumPy 向量化: 0.0123 秒
Python 列表推导: 3.4567 秒
# 向量化比列表推导快 ~280 倍!
关键要点
| 概念 | 说明 |
| --- | --- |
| `ndarray` | 同构多维数组,所有元素类型相同 |
| `shape` | 元组,描述各维度大小 |
| `dtype` | 数据类型,如 float64、int32 |
| 向量化 | 一次操作作用于整个数组,底层 C 实现 |
| 广播 | 不同形状数组的自动对齐运算 |
| `axis=0` | 沿行方向(对列操作),结果减少第0维 |
| `np.newaxis` | 增加一个大小为1的维度 |
NumPy 线性代数与数据应用
目标
- 掌握矩阵运算、特征值分解、SVD
- 求解线性方程组
- 使用 NumPy 进行数据清洗与特征工程
- 多项式拟合与信号处理入门
完整代码
1. 矩阵运算
import numpy as np
# ============================================================
# Matrix multiplication
# ============================================================
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
# Three spellings that give the same product for 2-D arrays.
C1 = A @ B # recommended (Python 3.5+)
C2 = np.matmul(A, B)
C3 = np.dot(A, B)
print(f"矩阵乘法:\n{C1}")
# Element-wise (Hadamard) product, not a matrix product.
print(f"逐元素乘法:\n{A * B}")
# Products of two 1-D vectors.
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
print(f"点积: {np.dot(a, b)}") # 32
print(f"内积: {np.inner(a, b)}") # 32
print(f"外积:\n{np.outer(a, b)}") # 3x3 matrix
# ============================================================
# Matrix properties
# ============================================================
print(f"\n行列式: {np.linalg.det(A):.2f}")
A_inv = np.linalg.inv(A)
print(f"逆矩阵:\n{A_inv}")
# Sanity check of the inverse: A @ A_inv should be the identity matrix.
# (This line was previously labelled as a transpose, which it is not.)
# Rounding hides floating-point noise in the product.
print(f"验证 A·A⁻¹:\n{(A @ A_inv).round(10)}")
# ============================================================
# Solving the linear system Ax = b
# ============================================================
#   3x +  y = 9
#    x + 2y = 8
b = np.asarray([9, 8])
A = np.asarray([[3, 1], [1, 2]])
x = np.linalg.solve(A, b)
print(f"\n方程组的解: x={x[0]:.1f}, y={x[1]:.1f}") # x=2, y=3
print(f"验证 Ax-b = {A @ x - b}") # residual, should be close to [0, 0]
2. 特征值与特征向量
# ============================================================
# Eigen-decomposition
# ============================================================
A = np.array([[4, 2], [1, 3]])
eigenvalues, eigenvectors = np.linalg.eig(A)
print(f"特征值: {eigenvalues}")
print(f"特征向量:\n{eigenvectors}")
# Check A @ v == λ * v for every eigenpair. λ is a scalar, so the
# right-hand side is an ordinary scalar multiplication, not a matrix product.
# np.linalg.eig returns the eigenvectors as the COLUMNS of `eigenvectors`.
for i, λ in enumerate(eigenvalues):
    v = eigenvectors[:, i]
    print(f"验证 A·v{i}: {A @ v}")
    print(f"验证 λ{i}·v: {λ * v}")
    print(f"误差: {np.linalg.norm(A @ v - λ * v):.2e}\n")
# ============================================================
# Singular value decomposition (SVD)
# ============================================================
M = np.arange(1, 7).reshape(2, 3)
# full_matrices=False requests the reduced ("economy") factorisation.
U, S, Vt = np.linalg.svd(M, full_matrices=False)
print(f"U (左奇异向量):\n{U}")
print(f"Σ (奇异值): {S}")
print(f"Vt (右奇异向量转置):\n{Vt}")
# Rebuild M from its factors; S is a 1-D vector, so it is expanded to a
# diagonal matrix before multiplying.
M_reconstructed = np.matmul(np.matmul(U, np.diag(S)), Vt)
print(f"重构矩阵:\n{M_reconstructed}")
print(f"重构误差: {np.linalg.norm(M - M_reconstructed):.2e}")
3. 数据清洗与统计
# ============================================================
# Handling missing values
# ============================================================
data = np.array([1.0, 2.0, np.nan, 4.0, np.nan, 6.0])
# Detect NaN entries.
nan_mask = np.isnan(data)
print(f"\nNaN 位置: {nan_mask}")
print(f"NaN 数量: {np.sum(nan_mask)}")
# Fill NaN with the mean of the non-NaN values.
clean_mean = np.where(nan_mask, np.nanmean(data), data)
print(f"均值填充: {clean_mean}")
# NOTE(review): this import is not used anywhere in this section; it is kept
# only in case code elsewhere in the file relies on it.
from numpy.lib.stride_tricks import sliding_window_view
# Forward fill: every NaN takes the most recent non-NaN value before it.
# For each position, record its own index if it is valid (0 otherwise), then
# a running maximum turns that into "index of the last valid entry so far".
# A NaN at the very start has nothing before it and therefore stays NaN.
last_valid = np.where(nan_mask, 0, np.arange(data.size))
np.maximum.accumulate(last_valid, out=last_valid)
clean_ffill = data[last_valid]
print(f"前向填充: {clean_ffill}")
# Simplest option: replace NaN with zero.
clean_zero = np.nan_to_num(data, nan=0)
print(f"零填充: {clean_zero}")
# ============================================================
# Descriptive statistics
# ============================================================
scores = np.asarray([78, 85, 92, 88, 76, 95, 89, 83, 91, 87])
print(f"\n均值: {np.mean(scores):.1f}")
print(f"中位数: {np.median(scores):.1f}")
print(f"标准差: {np.std(scores):.1f}")
print(f"方差: {np.var(scores):.1f}")
# Quartiles.
print(f"25%: {np.percentile(scores, 25):.1f}")
print(f"50%: {np.percentile(scores, 50):.1f}")
print(f"75%: {np.percentile(scores, 75):.1f}")
# Outlier detection with z-scores: standardise, then flag |z| > 2.
mu, sigma = scores.mean(), scores.std()
z_scores = (scores - mu) / sigma
outliers = np.absolute(z_scores) > 2
print(f"异常值 (|z| > 2): {scores[outliers]}")
# ============================================================
# Correlation coefficient
# ============================================================
y = np.asarray([2, 4, 5, 4, 6])
x = np.asarray([1, 2, 3, 4, 5])
# corrcoef returns the 2x2 correlation matrix of the two variables; the
# off-diagonal entry is the Pearson coefficient between x and y.
corr_matrix = np.corrcoef(x, y)
print(f"\n相关系数矩阵:\n{corr_matrix}")
print(f"皮尔逊相关系数: {corr_matrix[0, 1]:.4f}")
# ============================================================
# Histograms and binning
# ============================================================
data = np.random.randn(1000)
hist, bin_edges = np.histogram(data, bins=20)
print(f"\n直方图计数: {hist}")
print(f"分箱边界: {bin_edges}")
# Map numeric ages to labelled groups.
ages = np.array([15, 22, 35, 42, 55, 68, 73])
bins = [0, 18, 35, 60, 100]
labels = ["未成年", "青年", "中年", "老年"]
# Digitize against the INTERIOR edges only (18, 35, 60). That yields an
# index in 0..len(labels)-1 for every input, so it can always index
# `labels`. Digitizing against all edges and subtracting 1 produced index 4
# (IndexError) for ages >= 100 and -1 (silently the last label) for negative
# ages; such values now fall into the first / last group instead.
categories = np.digitize(ages, bins[1:-1])
print(f"年龄分组索引: {categories}")
print(f"年龄分组: {np.array(labels)[categories]}")
4. 多项式拟合
# ============================================================
# Polynomial regression
# ============================================================
x = np.arange(6)
y = np.asarray([1.1, 3.5, 8.2, 15.9, 26.2, 39.1]) # roughly y = x² + 2x + 1 plus noise
# Least-squares quadratic fit; coefficients come highest power first,
# i.e. [a, b, c] for a*x² + b*x + c.
coeffs = np.polyfit(x, y, 2)
print(f"\n拟合系数: a={coeffs[0]:.4f}, b={coeffs[1]:.4f}, c={coeffs[2]:.4f}")
# Fitted values at the sample points, and predictions on a denser grid.
y_fit = np.polyval(coeffs, x)
x_new = np.linspace(0, 6, num=50)
y_pred = np.polyval(coeffs, x_new)
# Coefficient of determination: R² = 1 - SS_res / SS_tot.
ss_res = np.sum(np.square(y - y_fit))
ss_tot = np.sum(np.square(y - np.mean(y)))
r2 = 1 - ss_res / ss_tot
print(f"R² = {r2:.6f}")
5. 信号处理(简易)
# ============================================================
# FFT (fast Fourier transform)
# ============================================================
sr = 1000 # sampling rate: 1000 Hz
# One second of samples; endpoint=False keeps the spacing at exactly 1/sr.
t = np.linspace(0, 1, sr, endpoint=False)
# Sum of a 50 Hz sine (amplitude 1) and a 120 Hz sine (amplitude 0.5).
signal = np.sin(2 * np.pi * 50 * t) + 0.5 * np.sin(2 * np.pi * 120 * t)
# FFT and the frequency (in Hz) of every output bin.
fft_vals = np.fft.fft(signal)
freqs = np.fft.fftfreq(len(signal), 1 / sr)
# Keep only the strictly positive frequencies.
positive_mask = freqs > 0
freqs_pos = freqs[positive_mask]
magnitude = np.abs(fft_vals[positive_mask])
# The signal contains exactly two sinusoids, so report the two strongest
# bins, strongest first. Taking the top 5 also listed three noise-floor bins
# whose near-zero magnitudes are just floating-point residue.
n_peaks = 2
peak_idx = np.argsort(magnitude)[-n_peaks:][::-1]
print(f"\n主要频率成分 (Hz): {freqs_pos[peak_idx]}")
print(f"对应幅值: {magnitude[peak_idx].round(2)}")
# Expected output: 50 Hz and 120 Hz.
关键要点
| 函数 | 说明 |
| --- | --- |
| `np.linalg.solve(A, b)` | 解 Ax = b |
| `np.linalg.eig(A)` | 特征值与特征向量 |
| `np.linalg.svd(M)` | 奇异值分解 |
| `np.polyfit(x, y, deg)` | 多项式拟合 |
| `np.fft.fft(signal)` | 快速傅里叶变换 |
| `np.corrcoef(x, y)` | 相关系数矩阵 |
| `np.isnan(x)` | 检测 NaN |