# 优化了脚本,补全了一些缺失的部分。
# 提交文件: submission_fastai.csv
# 状态: Complete · 10s ago
# 得分: 0.76315
# 比之前好太多了。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
# fastai
from fastai.tabular.all import *
# ========== 1. Load the data ==========
# Paths are resolved relative to '.', so train.csv and test.csv are expected
# in the current working directory.
path = Path('.')
train_df_raw, test_df_raw = (
    pd.read_csv(path / csv_name) for csv_name in ('train.csv', 'test.csv')
)
# ========== 2. Drop unused columns ==========
# Columns the author considers low-value or hard to exploit for prediction;
# they are removed from both frames before modelling.
useless_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
# errors='ignore' skips any listed column a frame does not contain, so no
# per-column membership check is needed.
train_df_raw.drop(columns=useless_cols, errors='ignore', inplace=True)
test_df_raw.drop(columns=useless_cols, errors='ignore', inplace=True)
# ========== 3. Handle columns that are complete in train but not in test (e.g. Fare) ==========
# Author's note: if Fare has no missing values in the training set but does in
# the test set, fastai's FillMissing fails with a NaN assertion at inference.
# In that case the test-set gaps are filled with the training-set median.
if not train_df_raw['Fare'].isnull().any() and test_df_raw['Fare'].isnull().any():
    fare_median = train_df_raw['Fare'].median()
    test_df_raw['Fare'] = test_df_raw['Fare'].fillna(fare_median)

# Training rows without an Embarked value are dropped outright rather than
# left for FillMissing (the author expects only a handful of such rows).
if train_df_raw['Embarked'].isnull().any():
    train_df_raw.dropna(subset=['Embarked'], inplace=True)

# Report the remaining per-column missing-value counts for both frames.
print("训练集各列缺失数:\n", train_df_raw.isnull().sum())
print("测试集各列缺失数:\n", test_df_raw.isnull().sum())
# ========== 4. Column roles for the fastai tabular pipeline ==========
# Target column.
y_name = 'Survived'
# Features treated as categorical.
cat_names = [
    'Pclass',
    'Sex',
    'Embarked',
]
# Features treated as continuous.
cont_names = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
# Store the label as a pandas categorical. Author's note: for a 0/1 integer
# label this conversion is optional.
train_df_raw[y_name] = train_df_raw[y_name].astype('category')
# ========== 5. Build the DataLoaders with an 80/20 train/validation split ==========
# Seeded random split over row positions so the validation rows are reproducible.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(train_df_raw))
dls = TabularDataLoaders.from_df(
    df=train_df_raw,
    path='.',
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=y_name,
    y_block=CategoryBlock(),  # marks the target as a classification label
    procs=[Categorify, FillMissing, Normalize],  # preprocessing pipeline
    # from_df selects validation rows through `valid_idx`; it has no `splits`
    # parameter, so the seeded split must be handed over as validation indices
    # or it is not the split that gets used.
    valid_idx=list(splits[1]),
    bs=64,
)
# Preview one batch.
dls.show_batch()
# ========== 6. Create the model and train it ==========
# Two hidden layers (sizes are tunable); accuracy is reported as the metric.
learn = tabular_learner(dls, layers=[200, 100], metrics=accuracy)

# learn.lr_find() could be used to search for a learning rate; here it is
# fixed at 1e-3. Trains for 120 epochs with the one-cycle schedule.
learn.fit_one_cycle(n_epoch=120, lr_max=1e-3)

# Validation-set performance: validate() yields the loss followed by the metric.
val_loss, val_acc = learn.validate()
print(f'验证集: loss={val_loss:.4f}, accuracy={val_acc:.4f}')

# Confusion matrix for the validation set.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix(figsize=(4, 4))
# ========== 7. Predict on the test set and write the submission file ==========
# Build a test DataLoader from the learner's existing DataLoaders.
test_dl = learn.dls.test_dl(test_df_raw)

# First returned value is used as the per-class predictions; the second is unused.
preds, _ = learn.get_preds(dl=test_dl)

# The argmax column index is used directly as the 0/1 label.
# NOTE(review): this assumes the class order is [0, 1] — confirm against the vocab.
class_preds = preds.argmax(dim=1).numpy()

# Kaggle expects PassengerId + Survived. PassengerId was dropped from
# test_df_raw earlier, so it is re-read from the CSV here.
test_ids = pd.read_csv(path/'test.csv', usecols=['PassengerId'])
submission = test_ids.assign(Survived=class_preds)

submission.to_csv('submission_fastai.csv', index=False)
print("预测提交文件已保存: submission_fastai.csv")