Finding Donors: Income Prediction Case Study¶
Problem Framing¶
In this case study, we employ several supervised algorithms to model individuals' income using data collected from the 1994 U.S. Census. We then choose the best candidate algorithm from preliminary results and further optimize it to best model the data. The goal of this implementation is to construct a model that accurately predicts whether an individual makes more than $50,000. This sort of task can arise in a non-profit setting, where organizations survive on donations. Understanding an individual's income can help a non-profit better understand how large a donation to request, or whether to reach out at all. While it can be difficult to determine an individual's general income bracket directly from public sources, we can (as we will see) infer this value from other publicly available features.
Data Overview¶
The dataset for this project originates from the UCI Machine Learning Repository. It was donated by Ron Kohavi and Barry Becker after being published in the article "Scaling Up the Accuracy of Naive-Bayes Classifiers: A Decision-Tree Hybrid"; the article by Ron Kohavi can be found online. The data we investigate here consists of small changes to the original dataset, such as removing the fnlwgt feature and records with missing or ill-formatted entries.
Results Summary¶
Goal: Predict whether an individual earns more than $50K/year to help focus outreach.
What worked best: AdaBoost (tuned) delivered the strongest overall performance while keeping training time reasonable.
Key numbers (test set)¶
- Naive baseline: Accuracy 0.2478, F0.5 0.2917
- Selected model (tuned AdaBoost): Accuracy 0.8507, F0.5 0.7122
- Relative improvement: The tuned model improves F0.5 by +0.4205 over the naive baseline.
Model comparison (100% training data)¶
| Model | F0.5 (Test) | Training Time (s) | Prediction Time (s) |
|---|---|---|---|
| Logistic Regression | 0.683 | 0.539 | 0.009 |
| Random Forest | 0.680 | 4.626 | 0.245 |
| AdaBoost | 0.703 | 1.966 | 0.162 |
Final choice¶
AdaBoost was chosen because it achieved the best F0.5 among the compared models and remained much faster than Random Forest. The tuned configuration used n_estimators = 400 and learning_rate = 0.3.
Exploring the Data¶
The code cell below loads necessary Python libraries and the census data. Note that the last column from this dataset, income, will be our target label (whether an individual makes more than, or at most, $50,000 annually). All other columns are features about each individual in the census database.
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames
import sys
from pathlib import Path
# Add the repository root to Python path so "src" can be imported
repo_root = Path().resolve().parent if Path().resolve().name == "notebooks" else Path().resolve()
sys.path.insert(0, str(repo_root))
# Import supplementary visualization code visuals.py
from src import visuals as vs
# Pretty display for notebooks
%matplotlib inline
# Load the Census dataset
data = pd.read_csv(repo_root / "data/census.csv")
# Success - Display the first record
display(data.head(n=1))
| age | workclass | education_level | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | Bachelors | 13.0 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174.0 | 0.0 | 40.0 | United-States | <=50K |
Data overview (quick EDA)¶
A cursory investigation of the dataset will determine how many individuals fit into either group, and will tell us about the percentage of these individuals making more than $50,000. In the code cell below, we compute the following:
- The total number of records, n_records.
- The number of individuals making more than $50,000 annually, n_greater_50k.
- The number of individuals making at most $50,000 annually, n_at_most_50k.
- The percentage of individuals making more than $50,000 annually, greater_percent.
# Total number of records
n_records = data.shape[0]
# Number of records where individual's income is more than $50,000
n_greater_50k = data["income"].value_counts()[">50K"]
# Number of records where individual's income is at most $50,000
n_at_most_50k = data["income"].value_counts()["<=50K"]
# Percentage of individuals whose income is more than $50,000
greater_percent = (n_greater_50k / n_records) * 100
# Print the results
print("Total number of records: {}".format(n_records))
print("Individuals making more than $50,000: {}".format(n_greater_50k))
print("Individuals making at most $50,000: {}".format(n_at_most_50k))
print("Percentage of individuals making more than $50,000: {}%".format(greater_percent))
Total number of records: 45222
Individuals making more than $50,000: 11208
Individuals making at most $50,000: 34014
Percentage of individuals making more than $50,000: 24.78439697492371%
Featureset Exploration¶
- age: continuous.
- workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
- education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
- education-num: continuous.
- marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
- occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
- relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
- race: Black, White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other.
- sex: Female, Male.
- capital-gain: continuous.
- capital-loss: continuous.
- hours-per-week: continuous.
- native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
Preparing the Data¶
Fortunately, for this dataset there are no invalid or missing entries we must deal with; however, there are some qualities of certain features that must be adjusted. This preprocessing can help tremendously with the model's outcome and predictive power.
Transforming Skewed Continuous Features¶
A dataset may sometimes contain at least one feature whose values tend to lie near a single number, but will also have a non-trivial number of vastly larger or smaller values than that single number. Algorithms can be sensitive to such distributions of values and can underperform if the range is not properly normalized. With the census dataset two features fit this description: capital-gain and capital-loss.
The code cell below plots a histogram of these two features. Note the range of the values present and how they are distributed.
# Split the data into features and target label
income_raw = data['income']
features_raw = data.drop('income', axis = 1)
# Visualize skewed continuous features of original data
vs.distribution(data)
For highly-skewed feature distributions such as capital-gain and capital-loss, it is common practice to apply a logarithmic transformation to the data so that the very large and very small values do not negatively affect the performance of a learning algorithm. Using a logarithmic transformation significantly reduces the range of values caused by outliers. Care must be taken when applying this transformation, however: the logarithm of 0 is undefined, so we must translate the values by a small amount above 0 to apply the logarithm successfully.
The code cell below performs a transformation on the data and visualizes the results. Again, note the range of values and how they are distributed.
# Log-transform the skewed features
skewed = ['capital-gain', 'capital-loss']
features_log_transformed = pd.DataFrame(data = features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))
# Visualize the new log distributions
vs.distribution(features_log_transformed, transformed = True)
Normalizing Numerical Features¶
In addition to performing transformations on features that are highly skewed, it is often good practice to perform some type of scaling on numerical features. Applying a scaling to the data does not change the shape of each feature's distribution (such as capital-gain or capital-loss above); however, normalization ensures that each feature is treated equally when applying supervised learners. Note that once scaling is applied, observing the data in its raw form will no longer carry the same original meaning, as the example below shows.
The code cell below normalizes each numerical feature. We use sklearn.preprocessing.MinMaxScaler for this.
# Import sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler() # default=(0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
features_log_minmax_transform = pd.DataFrame(data = features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])
# Show an example of a record with scaling applied
display(features_log_minmax_transform.head(n = 5))
| age | workclass | education_level | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.301370 | State-gov | Bachelors | 0.800000 | Never-married | Adm-clerical | Not-in-family | White | Male | 0.667492 | 0.0 | 0.397959 | United-States |
| 1 | 0.452055 | Self-emp-not-inc | Bachelors | 0.800000 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0.000000 | 0.0 | 0.122449 | United-States |
| 2 | 0.287671 | Private | HS-grad | 0.533333 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0.000000 | 0.0 | 0.397959 | United-States |
| 3 | 0.493151 | Private | 11th | 0.400000 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0.000000 | 0.0 | 0.397959 | United-States |
| 4 | 0.150685 | Private | Bachelors | 0.800000 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0.000000 | 0.0 | 0.397959 | Cuba |
Preprocessing pipeline¶
From the table in Exploring the Data above, we can see there are several features for each record that are non-numeric. One popular way to convert categorical variables is the one-hot encoding scheme. One-hot encoding creates a "dummy" variable for each possible category of each non-numeric feature.
Additionally, as with the non-numeric features, we need to convert the non-numeric target label, income, to numerical values for the learning algorithm to work. Since there are only two possible categories for this label ("<=50K" and ">50K"), we can avoid using one-hot encoding and simply encode these two categories as 0 and 1, respectively. In the code cell below, we implement the following:
- Use pandas.get_dummies() to perform one-hot encoding on the features_log_minmax_transform data.
- Convert the target label income_raw to numerical entries.
  - Set records with "<=50K" to 0 and records with ">50K" to 1.
# One-hot encode the 'features_log_minmax_transform' data using pandas.get_dummies()
# A list of the categorical column names
cat_features = features_log_minmax_transform.select_dtypes(include=["object"])
cat_cols = cat_features.columns
# Drop original categorical features and create dummies for them
features_final = pd.concat(
[
features_log_minmax_transform.drop(cat_cols, axis=1), # Drop all categorical columns
pd.get_dummies(features_log_minmax_transform[cat_cols], prefix=cat_cols, prefix_sep='_', drop_first=True) # Create dummies for all categorical columns
],
axis=1
)
# Encode the 'income_raw' data to numerical values
income = income_raw.map({">50K": 1, "<=50K": 0}).astype(int)
# Print the number of features after one-hot encoding
encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))
# Uncomment the following line to see the encoded feature names
#print(encoded)
#print(income.head())
#print(features_final.head())
95 total features after one-hot encoding.
Train/test split¶
Now all categorical variables have been converted into numerical features, and all numerical features have been normalized. As always, we will now split the data (both features and their labels) into training and test sets: 80% of the data will be used for training and 20% for testing. The code cell below performs this split.
# Import train_test_split
from sklearn.model_selection import train_test_split
# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_final,
income,
test_size = 0.2,
random_state = 0)
# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
Training set has 36177 samples. Testing set has 9045 samples.
Evaluating Model Performance¶
In this section, we investigate four different algorithms, and determine which is best at modeling the data. Three of these algorithms will be supervised learners, and the fourth algorithm is a naive predictor.
Baseline + metrics¶
For this case study we assume that individuals who make more than $50,000 are the most likely to donate to CharityML. Because of this, we are particularly interested in accurately predicting who makes more than $50,000. It would seem that using accuracy as a metric for evaluating a particular model's performance would be appropriate. Additionally, identifying someone who does not make more than $50,000 as someone who does would be detrimental, since we are looking to find individuals willing to donate. Therefore, a model's ability to precisely predict those that make more than $50,000 is more important than the model's ability to recall those individuals. We can use the F-beta score as a metric that considers both precision and recall:
$$ F_{\beta} = (1 + \beta^2) \cdot \frac{precision \cdot recall}{\left( \beta^2 \cdot precision \right) + recall} $$
In particular, when $\beta = 0.5$, more emphasis is placed on precision. This is called the F$_{0.5}$ score (or F-score for simplicity).
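To make the effect of $\beta$ concrete, the short sketch below (illustration only, not part of the original analysis) plugs a pair of hypothetical precision and recall values into the formula above and varies $\beta$.

```python
# Illustration only: how beta shifts the F-beta score between precision and recall.
def f_beta(precision, recall, beta):
    """F-beta score computed directly from precision and recall."""
    return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)

# Hypothetical values chosen purely for illustration
precision, recall = 0.80, 0.50
for beta in (0.5, 1.0, 2.0):
    print("beta = {:.1f} -> F = {:.3f}".format(beta, f_beta(precision, recall, beta)))
# beta = 0.5 gives ~0.714 (pulled toward precision, 0.80);
# beta = 2.0 gives ~0.541 (pulled toward recall, 0.50).
```

With $\beta = 0.5$ the score sits closer to precision, which matches the outreach goal of limiting false positives.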
Looking at the distribution of classes (those who make at most $50,000, and those who make more), it's clear most individuals do not make more than $50,000. This can greatly affect accuracy, since we could simply say "this person does not make more than $50,000" and generally be right, without ever looking at the data! Making such a statement would be called naive, since we have not considered any information to substantiate the claim. It is always important to consider the naive prediction for your data, to help establish a benchmark for whether a model is performing well. That being said, using that prediction would be pointless: if we predicted all people made less than $50,000, we would identify no one as donors.
Naive Predictor Performance¶
We generate a naive predictor to show what a base model without any intelligence would look like. Ideally, the base model would either be the result of a previous model or be based on a research paper upon which you are looking to improve.
TP = np.sum(income) # Counting the ones as this is the naive case. Note that 'income' is the 'income_raw' data encoded to numerical values done in the data preprocessing step.
FP = income.count() - TP # Specific to the naive case
TN = 0 # No predicted negatives in the naive case
FN = 0 # No predicted negatives in the naive case
# Calculate accuracy, precision and recall
accuracy = (TP + TN)/(TP + TN + FP + FN)
recall = TP / (TP + FN)
precision = TP / (TP + FP)
# Calculate F-score using the formula above for beta = 0.5 and correct values for precision and recall.
fscore = ((1 + 0.5**2) * precision * recall)/((0.5**2 * precision) + recall)
# Print the results
print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))
Naive Predictor: [Accuracy score: 0.2478, F-score: 0.2917]
Models Compared¶
Next, we list three supervised learning models that are appropriate for this problem to test on the census data.
Considering possible computational restrictions, performance, and convergence speed, I selected three models with the following profiles: a robust model with higher computational needs but potentially great performance; a good baseline model, meaning fast and reliable; and one that shares characteristics of both worlds, always aiming for the best metrics.
With this in mind, I selected:
AdaBoost (Ensemble method) (Top performer candidate)¶
What are the strengths of the model; when does it perform well?
- Focuses on difficult examples: Since each round of AdaBoost increases the weight of misclassified samples, the next weak learner concentrates more on those misclassified points. This means it can perform well when there is low label noise and the misclassified points are "genuine".
- Improves weak learners significantly: Since each learner adds a small, targeted correction to the ensemble, stacking these corrections yields a well-defined nonlinear decision function. It can perform well as long as the base learner does better than chance.
- Reduces bias while keeping variance moderate: The idea of boosting is to attack underfitting; the model adds complexity only where needed.
- Implicit feature selection: Since each round chooses the most discriminative split, informative features tend to be reused while weak ones might never appear. This amounts to selecting influential features during training, which is helpful if we expect only a subset of features to be truly influential.
What are the weaknesses of the model; when does it perform poorly?
- Sensitive to noise and outliers: AdaBoost adds weight to every misclassified sample, so mislabeled points or extreme outliers can bend the decision boundary toward noise. If there are many non-trivial label errors or many outliers, it can perform poorly.
- Requires careful tuning: Overly expressive weak learners can overfit, while tiny steps with too few rounds can underfit. Balancing important hyperparameters like the learning rate and the number of estimators is crucial.
What makes this model a good candidate for the problem, given what you know about the data? With AdaBoost and shallow trees, for example, we can reduce bias through the targeted-correction approach; it focuses on hard cases, and by adjusting class/instance weighting we can account for the class imbalance. This is a medium-to-large dataset with 45,222 samples and 95 features after one-hot encoding, so we need to be aware of the compute trade-off and use conservative settings when tuning. It can be a robust choice for the Finding Donors classification task.
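As a rough sketch of this setup (hyperparameters are assumed defaults, not tuned; X_train and y_train come from the train/test split above), AdaBoost over decision stumps can be cross-validated against the F0.5 metric:

```python
# A minimal sketch (settings assumed, not tuned) of the AdaBoost setup described
# above: shallow decision stumps boosted over many rounds, scored with F0.5.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, fbeta_score

stump_boost = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),  # weak learner: a decision stump
    n_estimators=50,
    random_state=0,
)
f05_scorer = make_scorer(fbeta_score, beta=0.5)
cv_scores = cross_val_score(stump_boost, X_train, y_train, scoring=f05_scorer, cv=3)
print("Cross-validated F0.5: {:.3f} +/- {:.3f}".format(cv_scores.mean(), cv_scores.std()))
```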
Logistic Regression (Fast, dependable baseline)¶
What are the strengths of the model; when does it perform well?
- Interpretable: Logistic regression learns one weight per feature plus an intercept, with each weight telling us how changing that feature shifts the log-odds (and hence the probability) of the positive class. This allows us to explain the direction and magnitude in which a feature affects our target.
- Usable probabilities (decision threshold): The model outputs a probability between 0 and 1. From there, we can set a decision threshold to match our goal: to catch more positives we can lower the threshold; to avoid false alarms we can raise it.
- Efficient training with a global optimum: Fitting a logistic regression model does not get stuck in local minima (the loss is convex). Training is fast and memory efficient even with many samples or sparse features, which is beneficial for a medium-to-large dataset whose features are mostly numeric or one-hot encoded, without the need for heavy compute.
What are the weaknesses of the model; when does it perform poorly?
- Underfitting on nonlinear problems: With logistic regression we assume the log-odds of the outcome is a linear combination of the features. If the relationship actually involves nonlinear or complex interactions between features, the model can underfit.
- Multicollinearity: When features are strongly correlated (for example, overlapping one-hot dummies), coefficients become unstable, which hurts interpretability and generalization. It is important to be aware of correlated features (drop or merge them), apply dimensionality reduction, or use regularization.
- Sensitive to outliers: Extreme values can pull the boundary. With only a few positive examples, estimates can become noisy and uncertain.
What makes this model a good candidate for the problem, given what you know about the data? It works as a strong, practical baseline before using heavier nonlinear models or ensembles. The medium-sized tabular classification task of 45,222 samples, with many sparse one-hot features and already-scaled numerical data, is ideal for a regularized logistic model. Coefficients are interpretable, which allows us to check feature importance for our target (in magnitude and direction). With class_weight="balanced" we could address the class imbalance (24.78% positive cases), and the decision threshold can be adjusted to favor precision, as sketched below.
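Here is a small sketch of both points (hyperparameters assumed, not tuned), using the X_train/X_test split from above: a balanced-class logistic regression plus a sweep over decision thresholds scored with F0.5.

```python
# A minimal sketch (assumed settings): balanced class weights plus a custom
# decision threshold applied to the predicted probabilities.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score

logreg = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=0)
logreg.fit(X_train, y_train)

# Probability of the positive class (">50K") for each test record
proba = logreg.predict_proba(X_test)[:, 1]

# Raising the threshold above 0.5 trades recall for precision,
# which is what the F0.5 objective rewards.
for threshold in (0.5, 0.6, 0.7):
    preds = (proba >= threshold).astype(int)
    print("threshold = {:.1f} -> F0.5 = {:.3f}".format(
        threshold, fbeta_score(y_test, preds, beta=0.5)))
```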
Random Forest (Ensemble method) (balanced)¶
What are the strengths of the model; when does it perform well?
- Works well on clean data: When labels are mostly correct and features carry real signal (i.e., low label noise), the different trees largely agree on the same patterns, which also means strong generalization.
- Less sensitive to noise: The effect of outliers or moderately noisy features is diluted by averaging, and feature sampling prevents any single noisy feature from dominating every split.
- Reduces variance through averaging: Unlike single decision trees, random forests decorrelate the trees by training many of them on bootstrap samples and considering a random subset of features at each split. This reduces variance and overfitting.
- Feature importance can be obtained: The algorithm provides feature importances, which are very helpful for identifying which inputs drive predictions.
What are the weaknesses of the model; when does it perform poorly?
- Computationally heavy: The Random Forest algorithm trains and stores many trees; depending on the number of samples, features, and trees, memory can become an important constraint.
- Less interpretable: Compared to a single decision tree's rules, a Random Forest loses that interpretability advantage. However, it is still possible to inspect feature importances.
- Imbalanced classes could be a problem: Majority vote can favor the prevalent class unless we adjust class weights or sampling.
What makes this model a good candidate for the problem, given what you know about the data? Given the characteristics of the dataset (45,222 records and 95 features after one-hot encoding), single decision trees could overfit and not generalize well. With a Random Forest we can reduce variance and obtain stable generalization while still capturing nonlinearities. It is robust to irrelevant noise, and we can extract feature importances to understand which inputs drive predictions. For better results we also have the option to adjust class weights to handle the class imbalance in our dataset.
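A minimal sketch of these two options (hyperparameters assumed, not tuned), again using the training split from above: a forest with balanced class weights, plus its built-in importance ranking.

```python
# A minimal sketch (assumed settings): Random Forest with balanced class weights
# and a quick look at its impurity-based feature importances.
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

forest = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",  # counteract the ~25% positive class
    random_state=0,
    n_jobs=-1,
)
forest.fit(X_train, y_train)

# Rank features by impurity-based importance and show the top five
top_features = pd.Series(forest.feature_importances_, index=X_train.columns)
print(top_features.sort_values(ascending=False).head(5))
```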
Implementation - Creating a Training and Predicting Pipeline¶
To properly evaluate the performance of each model, it's important to create a training and predicting pipeline that allows us to quickly and effectively train models using various sizes of training data and perform predictions on the testing data.
# Import two metrics from sklearn - fbeta_score and accuracy_score
from sklearn.metrics import accuracy_score, fbeta_score
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
'''
inputs:
- learner: the learning algorithm to be trained and predicted on
- sample_size: the size of samples (number) to be drawn from training set
- X_train: features training set
- y_train: income training set
- X_test: features testing set
- y_test: income testing set
'''
results = {}
X_train_sample = X_train[:sample_size]
y_train_sample = y_train[:sample_size]
# Fit the learner to the training data using slicing with 'sample_size' using .fit(training_features[:], training_labels[:])
start = time() # Get start time
learner = learner.fit(X_train_sample, y_train_sample)
end = time() # Get end time
# Calculate the training time
results['train_time'] = end - start
# Get the predictions on the test set(X_test),
# then get predictions on the first 300 training samples(X_train) using .predict()
start = time() # Get start time
predictions_test = learner.predict(X_test)
predictions_train = learner.predict(X_train_sample[:300])
end = time() # Get end time
# Calculate the total prediction time
results['pred_time'] = end - start
# Compute accuracy on the first 300 training samples which is y_train[:300]
results['acc_train'] = accuracy_score(y_train_sample[:300], predictions_train)
# Compute accuracy on test set using accuracy_score()
results['acc_test'] = accuracy_score(y_test, predictions_test, )
# Compute F-score on the the first 300 training samples using fbeta_score()
results['f_train'] = fbeta_score(y_train_sample[:300], predictions_train, beta=0.5)
# Compute F-score on the test set which is y_test
results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)
# Success
print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
# Return the results
return results
Implementation: Initial Model Evaluation¶
In the code cell below, the following is implemented:
- Import the three supervised learning models discussed in the previous section.
- Initialize the three models and store them in clf_A, clf_B, and clf_C.
  - Use a random_state for each model.
  - Note: I used the default settings for each model; we will tune one specific model in a later section.
- Calculate the number of records equal to 1%, 10%, and 100% of the training data.
  - Store those values in samples_1, samples_10, and samples_100, respectively.
# Import the three supervised learning models from sklearn
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Initialize the three models
clf_A = LogisticRegression(random_state=0)
clf_B = RandomForestClassifier(random_state=0)
clf_C = AdaBoostClassifier(random_state=0)
# Calculate the number of samples for 1%, 10%, and 100% of the training data
samples_100 = len(y_train)
samples_10 = int(0.1 * len(y_train))
samples_1 = int(0.01 * len(y_train))
# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C]:
clf_name = clf.__class__.__name__
results[clf_name] = {}
for i, samples in enumerate([samples_1, samples_10, samples_100]):
results[clf_name][i] = \
train_predict(clf, samples, X_train, y_train, X_test, y_test)
# Run metrics visualization for the three supervised learning models chosen
vs.evaluate(results, accuracy, fscore)
LogisticRegression trained on 361 samples.
LogisticRegression trained on 3617 samples.
LogisticRegression trained on 36177 samples.
RandomForestClassifier trained on 361 samples.
RandomForestClassifier trained on 3617 samples.
RandomForestClassifier trained on 36177 samples.
AdaBoostClassifier trained on 361 samples.
AdaBoostClassifier trained on 3617 samples.
AdaBoostClassifier trained on 36177 samples.
print(f"Logistic Regression F0.5 on testing for 100% training data: {results['LogisticRegression'][2]['f_test']:.3f}")
print(f"Random Forest F0.5 on testing for 100% training data: {results['RandomForestClassifier'][2]['f_test']:.3f}")
print(f"AdaBoost F0.5 on testing for 100% training data: {results['AdaBoostClassifier'][2]['f_test']:.3f}")
print()
print(f"Logistic Regression Training Time for 100% training data: {results['LogisticRegression'][2]['train_time']:.3f}")
print(f"Random Forest Training Time for 100% training data: {results['RandomForestClassifier'][2]['train_time']:.3f}")
print(f"AdaBoost Training Time for 100% training data: {results['AdaBoostClassifier'][2]['train_time']:.3f}")
print()
print(f"Logistic Regression Prediction Time for 100% training data: {results['LogisticRegression'][2]['pred_time']:.3f}")
print(f"Random Forest Prediction Time for 100% training data: {results['RandomForestClassifier'][2]['pred_time']:.3f}")
print(f"AdaBoost Prediction Time for 100% training data: {results['AdaBoostClassifier'][2]['pred_time']:.3f}")
Logistic Regression F0.5 on testing for 100% training data: 0.683
Random Forest F0.5 on testing for 100% training data: 0.680
AdaBoost F0.5 on testing for 100% training data: 0.703

Logistic Regression Training Time for 100% training data: 0.457
Random Forest Training Time for 100% training data: 4.722
AdaBoost Training Time for 100% training data: 1.980

Logistic Regression Prediction Time for 100% training data: 0.009
Random Forest Prediction Time for 100% training data: 0.251
AdaBoost Prediction Time for 100% training data: 0.174
Result Table¶
| Model (100% train) | F0.5 Test | Training Time (s) | Prediction Time (s) |
|---|---|---|---|
| Logistic Regression | 0.683 | 0.539 | 0.009 |
| Random Forest | 0.680 | 4.626 | 0.245 |
| AdaBoost | 0.703 | 1.966 | 0.162 |
Best Model + tuning + explanation¶
In this final section, we choose from the three supervised learning models the best model to use on the census data. We will then perform a grid search optimization for the model over the entire training set (X_train and y_train) by tuning at least one parameter to improve upon the untuned model's F-score.
Model Selection¶
From the evaluation metrics for the three supervised learning models, the model is selected by weighing: the F-score (β=0.5) on the test set when 100% of the training data is used, training/prediction time, and suitability for our data.
Metrics: F0.5 is used to emphasize precision, which aligns with CharityML’s goal of accurately identifying people earning >$50K. At 100% training, AdaBoost achieves the highest test F0.5 = 0.703, ahead of Logistic Regression (0.683) and Random Forest (0.680). Based on this alone, AdaBoost would be the top choice.
Prediction/training time: Logistic Regression is fastest (train 0.423 s; predict 0.012 s), but it stays slightly behind on F0.5. Random Forest is slowest (train 5.318 s; predict 0.259 s) and still underperforms. AdaBoost, as expected, is a balanced approach (train 2.255 s; predict 0.162 s) while delivering the best F0.5.
Suitability for the data: The medium-sized, one-hot-encoded tabular data might include non-linear relationships. Logistic Regression can underfit these patterns and Random Forest is heavier without a gain in F0.5. AdaBoost, using boosted shallow trees, captures key interactions efficiently and aligns with our precision objective.
Conclusion: Based on metrics, efficiency, and fit, AdaBoost is the selected model. It can likely be improved further by tuning n_estimators and learning_rate, and by calibrating the decision threshold (or using class weights).
Describing the Model in Layman's Terms¶
In order to identify potential donors, we trained a model called AdaBoost using many examples where income is known (above or below $50K). With AdaBoost, picture an assembly line with several simple inspectors. Each inspector checks one small thing. After a practice run, they review what they missed and pay extra attention to those tricky items next time. Over time, inspectors who prove more reliable earn a stronger voice in the final decision.
When we see a new person’s information (age, education, hours worked, etc.), every inspector gives a quick yes/no vote (above or below $50K) on whether they look like a strong outreach candidate. The reliable inspectors’ voices count more, and that weighted vote becomes the prediction.
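For the curious, the small sketch below (illustrative settings only, using the training split from above) peeks at this weighted vote: each boosted stump receives a weight, and accuracy improves as more weighted votes are accumulated.

```python
# Illustration of the weighted vote: per-learner weights and how the ensemble's
# test accuracy evolves as more weak learners contribute.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

demo = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=10,
    random_state=0,
).fit(X_train, y_train)

# Each weak learner's "say" in the final vote
print("Weak learner weights:", demo.estimator_weights_.round(2))

# Accuracy after 1, 2, ..., 10 weak learners have voted
for i, staged in enumerate(demo.staged_predict(X_test), start=1):
    print("After {:2d} learners: accuracy = {:.3f}".format(i, accuracy_score(y_test, staged)))
```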
Hyperparameter tuning¶
We fine-tune the chosen model using grid search (GridSearchCV), with at least one important parameter tuned over at least three different values. In the code cell below, the following is implemented:
- Import sklearn.model_selection.GridSearchCV and sklearn.metrics.make_scorer.
- Initialize the classifier and store it in clf.
  - Set a random_state.
- Create a dictionary of parameters we wish to tune for the chosen model.
- Use make_scorer to create an fbeta_score scoring object (with $\beta = 0.5$).
- Perform grid search on the classifier clf using the scorer, and store it in grid_obj.
- Fit the grid search object to the training data (X_train, y_train), and store it in grid_fit.
# Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Initialize the classifier
base = DecisionTreeClassifier(max_depth=1)
clf = AdaBoostClassifier(estimator = base, random_state=0)
# Create the parameters list you wish to tune, using a dictionary if needed.
parameters = {'n_estimators': [50, 100, 200, 400], 'learning_rate': [0.01, 0.03, 0.1, 0.3, 1]}
# Make an fbeta_score scoring object using make_scorer()
scorer = make_scorer(fbeta_score, beta=0.5)
# Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()
grid_obj = GridSearchCV(clf, parameters, scoring=scorer, verbose=3)
# Fit the grid search object to the training data and find the optimal parameters using fit()
grid_fit = grid_obj.fit(X_train, y_train)
# Get the estimator
best_clf = grid_fit.best_estimator_
# Make predictions using the unoptimized and optimized models
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)
# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[per-fold cross-validation log trimmed]

Unoptimized model
------
Accuracy score on testing data: 0.8483
F-score on testing data: 0.7029

Optimized Model
------
Final accuracy score on the testing data: 0.8507
Final F-score on the testing data: 0.7122
print(best_clf)
print(grid_obj)
AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
learning_rate=0.3, n_estimators=400, random_state=0)
GridSearchCV(estimator=AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
random_state=0),
param_grid={'learning_rate': [0.01, 0.03, 0.1, 0.3, 1],
'n_estimators': [50, 100, 200, 400]},
scoring=make_scorer(fbeta_score, response_method='predict', beta=0.5),
verbose=3)
Final tuned model performance¶
| Metric | Unoptimized Model | Optimized Model |
|---|---|---|
| Accuracy Score | 0.8483 | 0.8507 |
| F-score | 0.7029 | 0.7122 |
The optimized model obtained an accuracy of 0.8507 and an F-score (β=0.5) of 0.7122 on the test set. Compared with the unoptimized model, which got an accuracy of 0.8483 and an F0.5 of 0.7029, there is a small but real gain (accuracy: +0.0024, F0.5: +0.0093).
When comparing with the naive benchmark established earlier (accuracy: 0.2478, F0.5: 0.2917), the optimized model improves substantially.
Regarding the GridSearchCV implementation, it is worth mentioning that we could try to improve the model further by increasing n_estimators or expanding the search grid, among many other possible tweaks; one option is sketched below.
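One option (not executed in this notebook; the parameter ranges below are assumptions for illustration) is to widen the search with RandomizedSearchCV, which samples a fixed number of combinations from a larger grid:

```python
# A possible follow-up (not run here): widen the search space while keeping the
# number of fitted models bounded by sampling combinations at random.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, fbeta_score

wider_params = {
    "n_estimators": [200, 400, 800, 1200],
    "learning_rate": [0.05, 0.1, 0.3, 0.5, 1.0],
    "estimator__max_depth": [1, 2, 3],  # depth of the boosted trees
}
search = RandomizedSearchCV(
    AdaBoostClassifier(estimator=DecisionTreeClassifier(), random_state=0),
    param_distributions=wider_params,
    n_iter=15,                      # sample 15 of the 60 combinations
    scoring=make_scorer(fbeta_score, beta=0.5),
    cv=5,
    random_state=0,
    n_jobs=-1,
)
# search.fit(X_train, y_train)      # uncomment to run (this is the slow part)
# print(search.best_params_, search.best_score_)
```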
Feature Importance¶
An important task when performing supervised learning on a dataset like this case study's census data is determining which features provide the most predictive power. By focusing on the relationship between only a few crucial features and the target label, we simplify our understanding of the phenomenon, which is almost always a useful thing to do. In the case of this project, that means we wish to identify a small number of features that most strongly predict whether an individual makes at most or more than $50,000.
We use the scikit-learn AdaBoost classifier, which exposes a feature_importances_ attribute that ranks the importance of features according to the chosen classifier. In the next Python cell we fit this classifier to the training set and use this attribute to determine the top five most important features for the census dataset.
What features matter most (hypothesis)¶
When exploring the data, it was shown that there are thirteen available features for each individual on record in the census data. As a hypothesis, the five features I would rank as most important for prediction are:
- capital-gain: A straightforward indicator; the more capital gain, the higher the probability that the person earns more than $50K.
- education: Higher educational attainment is commonly associated with higher earnings, so it should be a strong predictor of incomes above $50K.
- occupation: For some occupations, such as tech-related roles, sales, or exec-managerial, we could expect greater income.
- age: Depending on the stage of life, higher earnings could be expected; the usual expectation is that income grows with age.
- hours-per-week: Even though it depends on the occupation, it is still a good indicator; more hours worked increases the probability of earning more than $50K.
Observed feature importance¶
In the code cell below, the following is implemented:
- Train the supervised model on the entire training set.
- Extract the feature importances using
.feature_importances_.
# Train the supervised model on the training set using .fit(X_train, y_train)
start = time()
model = best_clf.fit(X_train, y_train)
end = time()
print(f"Training time for the optimized model: {end - start:.3f} seconds")
# Extract the feature importances using .feature_importances_
importances = model.feature_importances_
# Plot
vs.feature_plot(importances, X_train, y_train)
Training time for the optimized model: 14.839 seconds
Comparison to hypothesis¶
The five features we expected to show the most predictive power, as stated in the hypothesis above, were: 1. capital-gain, 2. education, 3. occupation, 4. age, and 5. hours-per-week.
The five most influential features selected by the AdaBoost model were: 1. capital-gain, 2. marital-status_Married-civ-spouse, 3. education-num, 4. age, and 5. capital-loss.
Seeing capital-gain rank first matches the intuition that investment income strongly signals earnings above $50K. I did have the intuition that age and education were important for the target, but education seems to be better captured by the number of years of schooling (education-num); possibly this suggests that the individual one-hot education categories are noisier or less decisive.
From the model's feature evaluation, it is understandable that capital-loss could be an important factor too, and I missed it: capital-loss also implies the capacity to invest, which is still a proxy for wealth/income and therefore for earning more than $50K. For the marital-status_Married-civ-spouse category, the correlation makes sense if we think about dual-income households and the career stability normally expected under those circumstances.
Feature Selection¶
How does the model perform if we use only a subset of all the available features in the data? With fewer features required to train, the expectation is that training and prediction times are much lower, at the cost of some performance. From the visualization above, we see that the top five most important features contribute more than half of the total importance of all features present in the data. This hints that we can attempt to reduce the feature space and simplify the information required for the model to learn. The code cell below uses the same optimized model found earlier and trains it on the same training set with only the top five important features.
# Import functionality for cloning a model
from sklearn.base import clone
# Reduce the feature space
X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances)[::-1])[:5]]]
# Train on the "best" model found from grid search earlier
start = time()
clf = (clone(best_clf)).fit(X_train_reduced, y_train)
end = time()
print(f"Training time for the optimized model on reduced data: {end - start:.3f} seconds")
print()
# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)
# Report scores from the final model using both versions of data
print("Final Model trained on full data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))
print("\nFinal Model trained on reduced data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta = 0.5)))
Training time for the optimized model on reduced data: 3.336 seconds

Final Model trained on full data
------
Accuracy on testing data: 0.8507
F-score on testing data: 0.7122

Final Model trained on reduced data
------
Accuracy on testing data: 0.8463
F-score on testing data: 0.7047
When reduced features are worth it¶
Training time for the optimized model: 18.618 seconds
Training time for the optimized model on reduced data: 4.136 seconds
Final Model trained on full data
------
Accuracy on testing data: 0.8507
F-score on testing data: 0.7122
Final Model trained on reduced data
------
Accuracy on testing data: 0.8463
F-score on testing data: 0.7047
Using only the top five features, the F-score changed by -0.0075 (-1.05%) and the accuracy by -0.0044 (-0.52%) compared to using all features; training was 4.50x faster (18.618 s vs. 4.136 s), a 77.8% reduction in training time.
Given a 77.8% reduction in training time, if training time is an important factor in our model design, we should consider using the reduced data as the training set.
Conclusion and Next Steps¶
Conclusion¶
This project built a complete supervised learning workflow to predict whether a person earns more than $50K/year, including preprocessing, baseline comparison, model evaluation, and hyperparameter tuning.
Among the tested models, AdaBoost achieved the best overall performance according to F0.5 (which prioritizes precision more than recall). After tuning, the final model reached Accuracy = 0.8507 and F0.5 = 0.7122, substantially outperforming the naive baseline (Accuracy = 0.2478, F0.5 = 0.2917).
Next steps (to make this more production-ready)¶
- Decision threshold tuning: Instead of always using the default 0.5 threshold, choose a threshold that better matches the outreach goal (for example, prioritize fewer false positives if outreach is expensive); see the sketch after this list.
- Probability calibration: Calibrate predicted probabilities so the “confidence” values are more reliable when ranking candidates.
- Fairness and bias checks: Evaluate performance across groups (for example, by sex, race, education level) and document any gaps.
- More robust validation: Use cross-validation and report average performance (not just one split) to reduce variance.
- Packaging for reuse: Move any repeated preprocessing steps into src/ so the notebook stays clean and the pipeline can be reused in scripts.
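As a starting point for the first item, the sketch below (assuming best_clf, X_test, and y_test from above) sweeps candidate decision thresholds and reports the one that maximizes F0.5; in practice the threshold should be chosen on a validation split rather than the test set.

```python
# A minimal sketch of threshold tuning: sweep candidate thresholds from the
# precision-recall curve and keep the one that maximizes F0.5.
import numpy as np
from sklearn.metrics import precision_recall_curve

proba = best_clf.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, proba)

# F0.5 at each candidate threshold (the last precision/recall pair has no threshold)
beta = 0.5
f05 = ((1 + beta**2) * precisions[:-1] * recalls[:-1]
       / (beta**2 * precisions[:-1] + recalls[:-1] + 1e-12))
best_idx = np.argmax(f05)
print("Best threshold: {:.3f} -> F0.5 = {:.3f}".format(thresholds[best_idx], f05[best_idx]))
```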