import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_sample_weight

# 5. Final Model Training and Holdout Evaluation
# Load engineered data
df = pd.read_csv("engineered_heart_data.csv")
X = df.drop("target", axis=1)
y = df["target"]

# Stratified 80/20 split keeps the class ratio identical in both partitions.
X_trainval, X_holdout, y_trainval, y_holdout = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Balanced per-sample weights compensate for class imbalance during training.
weights_full = compute_sample_weight(class_weight="balanced", y=y_trainval)
# Use best hyperparameters from the tuning notebook.
BEST_LEARNING_RATE = 0.05  # Replace with tuned value
BEST_THRESHOLD = 0.45  # Replace with tuned threshold

# Final model: trained on the full train+val set with balanced sample weights.
final_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    objective="binary:logistic",
    learning_rate=BEST_LEARNING_RATE,
    n_estimators=50,
    random_state=42,
)
final_model.fit(X_trainval, y_trainval, sample_weight=weights_full)

# Evaluate on holdout set
def evaluate(name, y_true, y_pred):
    """Print the standard binary-classification metrics for one prediction set.

    Args:
        name: Label shown in the printed header (e.g. "Holdout Set").
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels (already thresholded).
    """
    print(f"\n{name} Evaluation:")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall :", recall_score(y_true, y_pred))
    print("F1 Score :", f1_score(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))
# Predict with the tuned probability threshold rather than the default 0.5.
probs = final_model.predict_proba(X_holdout)[:, 1]
preds = (probs >= BEST_THRESHOLD).astype(int)
evaluate("Holdout Set", y_holdout, preds)

# Save predictions
# Persist the holdout features alongside the actual and predicted labels.
output = X_holdout.assign(actual=y_holdout, predicted=preds)
output.to_csv("final_holdout_predictions.csv", index=False)
print("✅ Saved final predictions to 'final_holdout_predictions.csv'")