LOADING

加载过慢时请开启缓存(浏览器默认已开启)。

ML-Titanic-day5

2025/1/24 ml

优化了脚本,并补全了对部分缺失值的处理。

submission_fastai.csv
Complete · 10s ago
0.76315

比之前好太多了

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# fastai
from fastai.tabular.all import *

# ========== 1. Load the data ==========
# The script assumes train.csv and test.csv sit in the current directory.
path = Path('.')
train_df_raw, test_df_raw = (
    pd.read_csv(path/csv_name) for csv_name in ('train.csv', 'test.csv')
)

# ========== 2. Drop unused columns ==========
# Author's note: these columns usually contribute little to the prediction or
# are hard to exploit, so they are simply discarded.
useless_cols = ['PassengerId','Name','Ticket','Cabin']
for frame in (train_df_raw, test_df_raw):
    # Drop in place so both module-level frames are trimmed directly;
    # errors='ignore' skips any listed column that is not present.
    frame.drop(columns=useless_cols, errors='ignore', inplace=True)

# ========== 3. Columns that may have gaps only in the test set (e.g. Fare) ==========
# Author's note: when Fare has no missing values in the training set but does
# in the test set, fastai's FillMissing fails with a NaN assertion error, so
# the test-set gaps are filled by hand with the training-set median.
test_fare_missing = test_df_raw['Fare'].isnull()
if test_fare_missing.any() and not train_df_raw['Fare'].isnull().any():
    fare_median = train_df_raw['Fare'].median()
    test_df_raw.loc[test_fare_missing, 'Fare'] = fare_median

# Training rows with a missing Embarked value are removed outright instead of
# being left for fastai's preprocessing to handle.
if train_df_raw['Embarked'].isnull().any():
    train_df_raw.dropna(subset=['Embarked'], inplace=True)

# Report the remaining per-column missing counts for both frames.
for heading, frame in (
    ("训练集各列缺失数:\n", train_df_raw),
    ("测试集各列缺失数:\n", test_df_raw),
):
    print(heading, frame.isna().sum())

# ========== 4. Key settings for fastai tabular classification ==========
y_name = 'Survived'                              # target column
cat_names = ['Pclass', 'Sex', 'Embarked']        # categorical features
cont_names = ['Age', 'SibSp', 'Parch', 'Fare']   # continuous features

# Author's note: fastai recommends a categorical label dtype (an int 0/1
# label would also work without the cast).
train_df_raw = train_df_raw.astype({y_name: 'category'})

# ========== 5. Build the DataLoaders with a validation split ==========
# Seeded random split over row positions: 80% training / 20% validation.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(train_df_raw))

# TabularDataLoaders.from_df does not take a `splits` argument: the validation
# rows are selected through `valid_idx`. Passing `splits=` here would not apply
# the seeded split above (from_df would create its own unseeded random split),
# so the validation positions are handed over explicitly instead.
dls = TabularDataLoaders.from_df(
    df=train_df_raw,
    path='.',
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=y_name,
    y_block=CategoryBlock(),  # marks this as a classification task
    procs=[Categorify, FillMissing, Normalize],  # fastai preprocessing pipeline
    valid_idx=splits[1],
    bs=64
)

# Peek at one batch
dls.show_batch()

# ========== 6. Create the model and train ==========
# Two hidden layers (sizes are tunable); accuracy is reported as the metric.
learn = tabular_learner(dls, layers=[200,100], metrics=accuracy)

# learn.lr_find() could be used to search for a learning rate; a fixed 1e-3
# is used here, with a one-cycle schedule over 120 epochs.
n_epochs, max_lr = 120, 1e-3
learn.fit_one_cycle(n_epochs, max_lr)

# Validation-set performance
val_loss, val_acc = learn.validate()
print(f'验证集: loss={val_loss:.4f}, accuracy={val_acc:.4f}')

# Confusion matrix
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix(figsize=(4,4))

# ========== 7. Predict on the test set and write the submission file ==========
# Build a test DataLoader from the learner's DataLoaders.
test_dl = learn.dls.test_dl(test_df_raw)

# First element of get_preds is the per-class prediction tensor; argmax over
# dim 1 picks the predicted class index for each row (0/1 per the author).
preds = learn.get_preds(dl=test_dl)[0]
class_preds = preds.argmax(dim=1).numpy()

# Kaggle expects PassengerId + Survived. PassengerId was dropped from
# test_df_raw earlier, so it is read again from the CSV.
test_ids = pd.read_csv(path/'test.csv', usecols=['PassengerId'])
submission = test_ids.assign(Survived=class_preds)

submission.to_csv('submission_fastai.csv', index=False)
print("预测提交文件已保存: submission_fastai.csv")