LOADING

加载过慢请开启缓存 浏览器默认开启

ML-Titanic-day4

2025/1/20 ml

红温了,这个好难,预测到最后还是不行

predictions.csv

Complete · 9m ago

0.48564

极限了 排名贼难看 13202 招笑了

Titanic - Machine Learning from Disaster | Kaggle

爆破seed

import subprocess
import sys
import time

# 获取pre.py的输出,并提取其中的accuracy值
def get_accuracy_from_pre(seed_value):
    # 调用pre.py并捕获输出
    result = subprocess.run(
        ['python3', 'pre.py', '--seed', str(seed_value)],
        capture_output=True, text=True
    )
    # 输出内容
    print(f"Output for seed {seed_value}:\n{result.stdout}")
    
    # 从输出中提取准确率
    for line in result.stdout.splitlines():
        if "Accuracy:" in line:
            accuracy = float(line.split(":")[1].strip())
            return accuracy
    return None

# 设置初始种子
seed_value = 42
accuracy = 0

# 循环直到准确率大于0.7
while accuracy <= 0.7:
    accuracy = get_accuracy_from_pre(seed_value)
    print(f"Seed {seed_value}: Accuracy {accuracy}")
    
    if accuracy > 0.7:
        print(f"Found seed {seed_value} with accuracy greater than 0.7")
        break
    
    seed_value += 1
    print(accuracy)
    #time.sleep(1)  # 让程序稍微等待,以免调用过于频繁

# 结束
print("Search completed.")

最好最快的就是96

预测代码

from fastai.tabular.all import *
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import random
import sys
import argparse
# 设置随机种子以确保结果可复现
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # 如果使用多个GPU
    torch.backends.cudnn.deterministic = True  # 确保每次计算结果一致
    torch.backends.cudnn.benchmark = False  # 防止启用加速

# 设置随机种子
# 获取命令行参数
parser = argparse.ArgumentParser(description="Train model with different seeds")
parser.add_argument('--seed', type=int, required=True, help="Random seed for training")
args = parser.parse_args()

# 设置种子
set_seed(args.seed)
# 检查 GPU 是否可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# 加载数据
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# 保留 PassengerId 以便后续使用
test_passenger_ids = test_df['PassengerId']

# 提取社会称谓(Title)
train_df['Title'] = train_df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
test_df['Title'] = test_df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

# 通过 Cabin 特征创建 Deck
train_df['Deck'] = train_df['Cabin'].apply(lambda x: x[0] if pd.notna(x) else 'U')
test_df['Deck'] = test_df['Cabin'].apply(lambda x: x[0] if pd.notna(x) else 'U')

# 删除无关的列
train_df = train_df.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)
test_df = test_df.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)

# 填补缺失值
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())  # 使用中位数填充
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])  # 使用众数填充
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())  # 填充 Fare 缺失值
test_df['Age'] = test_df['Age'].fillna(train_df['Age'].median())  # 使用训练集的中位数填充测试集

# 1. 计算 FamilySize
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

# 2. 计算 AgeRange
def age_range(age):
    if age < 12:
        return 'Child'
    elif age < 18:
        return 'Teenager'
    elif age < 60:
        return 'Adult'
    else:
        return 'Senior'

train_df['AgeRange'] = train_df['Age'].apply(age_range)
test_df['AgeRange'] = test_df['Age'].apply(age_range)

# 3. 计算 FareRange
def fare_range(fare):
    if fare < 20:
        return 'Low'
    elif fare < 50:
        return 'Medium'
    else:
        return 'High'

train_df['FareRange'] = train_df['Fare'].apply(fare_range)
test_df['FareRange'] = test_df['Fare'].apply(fare_range)

# 4. 计算 IsSmallFamily 和 IsLargeFamily
train_df['IsSmallFamily'] = (train_df['FamilySize'] <= 3).astype(int)
train_df['IsLargeFamily'] = (train_df['FamilySize'] > 3).astype(int)
test_df['IsSmallFamily'] = (test_df['FamilySize'] <= 3).astype(int)
test_df['IsLargeFamily'] = (test_df['FamilySize'] > 3).astype(int)

# 5. 社会阶层(Title)类别映射
train_df['Title'] = train_df['Title'].map(lambda x: x if x in ['Mr', 'Mrs', 'Miss', 'Master'] else 'Other')
test_df['Title'] = test_df['Title'].map(lambda x: x if x in ['Mr', 'Mrs', 'Miss', 'Master'] else 'Other')

# 6. 性别与舱位组合特征(Sex_Pclass)
train_df['Sex_Pclass'] = train_df['Sex'].astype(str) + '_' + train_df['Pclass'].astype(str)
test_df['Sex_Pclass'] = test_df['Sex'].astype(str) + '_' + test_df['Pclass'].astype(str)

# 7. 港口与票价交互特征(Embarked_Fare)
train_df['Embarked_Fare'] = train_df['Embarked'].astype(str) + '_' + train_df['Fare'].apply(lambda x: 'Low' if x < 50 else 'High')
test_df['Embarked_Fare'] = test_df['Embarked'].astype(str) + '_' + test_df['Fare'].apply(lambda x: 'Low' if x < 50 else 'High')

# 8. 是否从未结婚(IsSingle)
train_df['IsSingle'] = ((train_df['SibSp'] == 0) & (train_df['Parch'] == 0)).astype(int)
test_df['IsSingle'] = ((test_df['SibSp'] == 0) & (test_df['Parch'] == 0)).astype(int)

# 对类别特征进行编码
cat_cols = ['Sex', 'Embarked', 'AgeRange', 'FareRange', 'Title', 'Deck', 'Sex_Pclass', 'Embarked_Fare']
for col in cat_cols:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

# 特征和标签
target = 'Survived'
cont_names = [col for col in train_df.columns if col not in cat_cols + [target]]
cat_names = cat_cols

# 数据预处理
procs = [Categorify, FillMissing, Normalize]

# 创建 DataLoader
train_dl = TabularDataLoaders.from_df(train_df, path='.', valid_pct=0.2, y_names=target, cat_names=cat_names, cont_names=cont_names,
                                      procs=procs, bs=128)

# 训练模型
learn = tabular_learner(train_dl, layers=[512, 256, 128], metrics=[accuracy, RocAucBinary()], wd=1e-4)

# 将模型转移到 GPU
learn.to(device)

# 查找最适合的学习率
learn.lr_find()

# 学习率曲线
plt.figure()
learn.recorder.plot_lr_find()
plt.show()

# 使用 OneCycleLR 进行训练
learn.fit_one_cycle(1000, lr_max=1e-3)

# 绘制损失曲线
plt.figure()
learn.recorder.plot_loss()
plt.show()

# 预测测试集
test_dl = learn.dls.test_dl(test_df)
test_preds, _ = learn.get_preds(dl=test_dl)

# 获取二分类预测(0 或 1)
test_preds = torch.sigmoid(test_preds).numpy()  # Sigmoid 转换,得到 0-1 的概率值
test_preds = np.round(test_preds).clip(0, 1)  # 四舍五入并确保在 [0, 1] 范围内

# 将预测结果与 PassengerId 结合
output_df = pd.DataFrame({
    'PassengerId': test_passenger_ids,  # 保留测试集的 PassengerId
    'Survived': test_preds.astype(int).flatten()  # 确保结果为整数
})

# 保存预测结果
output_df.to_csv('predictions.csv', index=False)

print("预测结果已保存到 predictions.csv")
# 训练和评估过程
val_dl = learn.dls.valid
val_preds, val_y = learn.get_preds(dl=val_dl)
val_accuracy = accuracy(val_preds, val_y)
print(f"Accuracy: {val_accuracy.item()}")