{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data science experiments on credit cards dataset\n",
"\n",
"This notebook attempts to simulate the model training code a data scientist may provide as input to the ML architecture."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, import libraries and load the data."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Unnamed: 0 \n",
" Age \n",
" Annual_Income \n",
" Credit_Score \n",
" Loan_Amount \n",
" Loan_Duration_Years \n",
" Number_of_Open_Accounts \n",
" Had_Past_Default \n",
" Loan_Approval \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0 \n",
" 35.0 \n",
" 107770.0 \n",
" 331.0 \n",
" 31580.0 \n",
" 28 \n",
" 13.0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 1 \n",
" 1 \n",
" 52.0 \n",
" NaN \n",
" 636.0 \n",
" 9012.0 \n",
" 5 \n",
" 14.0 \n",
" 0 \n",
" 1 \n",
" \n",
" \n",
" 2 \n",
" 2 \n",
" 56.0 \n",
" 160017.0 \n",
" 809.0 \n",
" 45310.0 \n",
" 19 \n",
" 13.0 \n",
" 1 \n",
" 1 \n",
" \n",
" \n",
" 3 \n",
" 3 \n",
" 52.0 \n",
" 41654.0 \n",
" 422.0 \n",
" 47966.0 \n",
" 17 \n",
" 7.0 \n",
" 1 \n",
" 0 \n",
" \n",
" \n",
" 4 \n",
" 4 \n",
" 30.0 \n",
" 73198.0 \n",
" 414.0 \n",
" 35636.0 \n",
" 2 \n",
" 3.0 \n",
" 1 \n",
" 1 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Age Annual_Income Credit_Score Loan_Amount \\\n",
"0 0 35.0 107770.0 331.0 31580.0 \n",
"1 1 52.0 NaN 636.0 9012.0 \n",
"2 2 56.0 160017.0 809.0 45310.0 \n",
"3 3 52.0 41654.0 422.0 47966.0 \n",
"4 4 30.0 73198.0 414.0 35636.0 \n",
"\n",
" Loan_Duration_Years Number_of_Open_Accounts Had_Past_Default \\\n",
"0 28 13.0 0 \n",
"1 5 14.0 0 \n",
"2 19 13.0 1 \n",
"3 17 7.0 1 \n",
"4 2 3.0 1 \n",
"\n",
" Loan_Approval \n",
"0 0 \n",
"1 1 \n",
"2 1 \n",
"3 0 \n",
"4 1 "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split, RandomizedSearchCV\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.metrics import classification_report, ConfusionMatrixDisplay\n",
"\n",
"# Build the path to the raw training CSV from its directory and file name\n",
"_data_root, _data_filename = '../data', \"dataset.csv\"\n",
"_data_filepath = os.path.join(_data_root, _data_filename)\n",
"\n",
"# Load the dataset and preview its first five rows\n",
"dataframe = pd.read_csv(_data_filepath)\n",
"dataframe.head(n=5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Print dataset info and description using `pandas`."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 1000 entries, 0 to 999\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Unnamed: 0 1000 non-null int64 \n",
" 1 Age 950 non-null float64\n",
" 2 Annual_Income 970 non-null float64\n",
" 3 Credit_Score 960 non-null float64\n",
" 4 Loan_Amount 980 non-null float64\n",
" 5 Loan_Duration_Years 1000 non-null int64 \n",
" 6 Number_of_Open_Accounts 990 non-null float64\n",
" 7 Had_Past_Default 1000 non-null int64 \n",
" 8 Loan_Approval 1000 non-null int64 \n",
"dtypes: float64(5), int64(4)\n",
"memory usage: 70.4 KB\n"
]
}
],
"source": [
"dataframe.info()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Unnamed: 0 \n",
" Age \n",
" Annual_Income \n",
" Credit_Score \n",
" Loan_Amount \n",
" Loan_Duration_Years \n",
" Number_of_Open_Accounts \n",
" Had_Past_Default \n",
" Loan_Approval \n",
" \n",
" \n",
" \n",
" \n",
" count \n",
" 1000.000000 \n",
" 950.000000 \n",
" 970.000000 \n",
" 960.000000 \n",
" 980.000000 \n",
" 1000.000000 \n",
" 990.000000 \n",
" 1000.00000 \n",
" 1000.000000 \n",
" \n",
" \n",
" mean \n",
" 499.500000 \n",
" 44.357895 \n",
" 113529.173196 \n",
" 574.825000 \n",
" 28061.729592 \n",
" 14.832000 \n",
" 7.348485 \n",
" 0.51000 \n",
" 0.514000 \n",
" \n",
" \n",
" std \n",
" 288.819436 \n",
" 15.268179 \n",
" 49879.543788 \n",
" 154.573626 \n",
" 12962.369681 \n",
" 8.424057 \n",
" 3.967101 \n",
" 0.50015 \n",
" 0.500054 \n",
" \n",
" \n",
" min \n",
" 0.000000 \n",
" 18.000000 \n",
" 30060.000000 \n",
" 301.000000 \n",
" 5006.000000 \n",
" 1.000000 \n",
" 1.000000 \n",
" 0.00000 \n",
" 0.000000 \n",
" \n",
" \n",
" 25% \n",
" 249.750000 \n",
" 31.000000 \n",
" 67129.750000 \n",
" 442.000000 \n",
" 17662.250000 \n",
" 8.000000 \n",
" 4.000000 \n",
" 0.00000 \n",
" 0.000000 \n",
" \n",
" \n",
" 50% \n",
" 499.500000 \n",
" 45.000000 \n",
" 113365.500000 \n",
" 574.500000 \n",
" 28201.500000 \n",
" 15.000000 \n",
" 7.000000 \n",
" 1.00000 \n",
" 1.000000 \n",
" \n",
" \n",
" 75% \n",
" 749.250000 \n",
" 58.000000 \n",
" 159608.000000 \n",
" 707.000000 \n",
" 38693.750000 \n",
" 22.000000 \n",
" 11.000000 \n",
" 1.00000 \n",
" 1.000000 \n",
" \n",
" \n",
" max \n",
" 999.000000 \n",
" 69.000000 \n",
" 199991.000000 \n",
" 849.000000 \n",
" 49989.000000 \n",
" 29.000000 \n",
" 14.000000 \n",
" 1.00000 \n",
" 1.000000 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Age Annual_Income Credit_Score Loan_Amount \\\n",
"count 1000.000000 950.000000 970.000000 960.000000 980.000000 \n",
"mean 499.500000 44.357895 113529.173196 574.825000 28061.729592 \n",
"std 288.819436 15.268179 49879.543788 154.573626 12962.369681 \n",
"min 0.000000 18.000000 30060.000000 301.000000 5006.000000 \n",
"25% 249.750000 31.000000 67129.750000 442.000000 17662.250000 \n",
"50% 499.500000 45.000000 113365.500000 574.500000 28201.500000 \n",
"75% 749.250000 58.000000 159608.000000 707.000000 38693.750000 \n",
"max 999.000000 69.000000 199991.000000 849.000000 49989.000000 \n",
"\n",
" Loan_Duration_Years Number_of_Open_Accounts Had_Past_Default \\\n",
"count 1000.000000 990.000000 1000.00000 \n",
"mean 14.832000 7.348485 0.51000 \n",
"std 8.424057 3.967101 0.50015 \n",
"min 1.000000 1.000000 0.00000 \n",
"25% 8.000000 4.000000 0.00000 \n",
"50% 15.000000 7.000000 1.00000 \n",
"75% 22.000000 11.000000 1.00000 \n",
"max 29.000000 14.000000 1.00000 \n",
"\n",
" Loan_Approval \n",
"count 1000.000000 \n",
"mean 0.514000 \n",
"std 0.500054 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 1.000000 \n",
"75% 1.000000 \n",
"max 1.000000 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataframe.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Plot class balance."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Bar chart of the number of observations in each target class.\n",
"# Title and axis labels are set so the figure can be read on its own.\n",
"dataframe['Loan_Approval'].value_counts().plot(\n",
"    kind='bar',\n",
"    title=\"Class balance\",\n",
"    xlabel=\"Loan_Approval\",\n",
"    ylabel=\"Number of observations\",\n",
");"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Most of the columns have missing values, which will have to be handled in the data preprocessing. The unnamed column contains observation indexes and can be dropped. All of the other columns appear to contain useful information. The only categorical column is *Had_Past_Default*, which is already encoded as 0s and 1s, so no additional processing is needed. Finally, the classes appear to be well balanced so balancing techniques will probably not be needed."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Age \n",
" Annual_Income \n",
" Credit_Score \n",
" Loan_Amount \n",
" Loan_Duration_Years \n",
" Number_of_Open_Accounts \n",
" Had_Past_Default \n",
" Loan_Approval \n",
" \n",
" \n",
" \n",
" \n",
" count \n",
" 1000.000000 \n",
" 1000.000000 \n",
" 1000.00000 \n",
" 1000.000000 \n",
" 1000.000000 \n",
" 1000.00000 \n",
" 1000.00000 \n",
" 1000.000000 \n",
" \n",
" \n",
" mean \n",
" 42.140000 \n",
" 110123.298000 \n",
" 551.83200 \n",
" 27500.495000 \n",
" 14.832000 \n",
" 7.27500 \n",
" 0.51000 \n",
" 0.514000 \n",
" \n",
" \n",
" std \n",
" 17.748392 \n",
" 52808.112624 \n",
" 188.77845 \n",
" 13420.465049 \n",
" 8.424057 \n",
" 4.01441 \n",
" 0.50015 \n",
" 0.500054 \n",
" \n",
" \n",
" min \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.00000 \n",
" 0.000000 \n",
" 1.000000 \n",
" 0.00000 \n",
" 0.00000 \n",
" 0.000000 \n",
" \n",
" \n",
" 25% \n",
" 29.000000 \n",
" 65156.750000 \n",
" 425.75000 \n",
" 17015.500000 \n",
" 8.000000 \n",
" 4.00000 \n",
" 0.00000 \n",
" 0.000000 \n",
" \n",
" \n",
" 50% \n",
" 44.000000 \n",
" 110510.000000 \n",
" 567.50000 \n",
" 27702.000000 \n",
" 15.000000 \n",
" 7.00000 \n",
" 1.00000 \n",
" 1.000000 \n",
" \n",
" \n",
" 75% \n",
" 57.000000 \n",
" 158513.750000 \n",
" 699.00000 \n",
" 38485.750000 \n",
" 22.000000 \n",
" 11.00000 \n",
" 1.00000 \n",
" 1.000000 \n",
" \n",
" \n",
" max \n",
" 69.000000 \n",
" 199991.000000 \n",
" 849.00000 \n",
" 49989.000000 \n",
" 29.000000 \n",
" 14.00000 \n",
" 1.00000 \n",
" 1.000000 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Annual_Income Credit_Score Loan_Amount \\\n",
"count 1000.000000 1000.000000 1000.00000 1000.000000 \n",
"mean 42.140000 110123.298000 551.83200 27500.495000 \n",
"std 17.748392 52808.112624 188.77845 13420.465049 \n",
"min 0.000000 0.000000 0.00000 0.000000 \n",
"25% 29.000000 65156.750000 425.75000 17015.500000 \n",
"50% 44.000000 110510.000000 567.50000 27702.000000 \n",
"75% 57.000000 158513.750000 699.00000 38485.750000 \n",
"max 69.000000 199991.000000 849.00000 49989.000000 \n",
"\n",
" Loan_Duration_Years Number_of_Open_Accounts Had_Past_Default \\\n",
"count 1000.000000 1000.00000 1000.00000 \n",
"mean 14.832000 7.27500 0.51000 \n",
"std 8.424057 4.01441 0.50015 \n",
"min 1.000000 0.00000 0.00000 \n",
"25% 8.000000 4.00000 0.00000 \n",
"50% 15.000000 7.00000 1.00000 \n",
"75% 22.000000 11.00000 1.00000 \n",
"max 29.000000 14.00000 1.00000 \n",
"\n",
" Loan_Approval \n",
"count 1000.000000 \n",
"mean 0.514000 \n",
"std 0.500054 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 1.000000 \n",
"75% 1.000000 \n",
"max 1.000000 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Remove the leftover index column from the CSV and replace missing values with 0.\n",
"# The frame is reassigned instead of mutated in place, and errors=\"ignore\" skips\n",
"# the drop when the column is already gone, so this cell can be re-run safely\n",
"# (the in-place version raised a KeyError on the second execution).\n",
"dataframe = dataframe.drop(columns=[\"Unnamed: 0\"], errors=\"ignore\").fillna(0)\n",
"dataframe.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The preprocessed dataset is split (80% train, 20% test)."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(800, 7)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target_variable = \"Loan_Approval\"\n",
"\n",
"# Separate the predictors from the label, then hold out 20% of the rows for testing.\n",
"features = dataframe.drop(columns=[target_variable])\n",
"labels = dataframe[target_variable]\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    features,\n",
"    labels,\n",
"    test_size=0.2,\n",
"    shuffle=True,\n",
"    random_state=1337,\n",
")\n",
"X_train.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A simple hyperparameter search is run over a gradient boosting classifier."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'classifier__n_estimators': np.int64(168), 'classifier__max_depth': np.int64(6), 'classifier__learning_rate': np.float64(0.001)}\n",
"Best score: 0.5349999999999999\n"
]
}
],
"source": [
"categorical_feature = \"Had_Past_Default\"\n",
"numerical_features = [column for column in X_train.columns if column != categorical_feature]\n",
"\n",
"# Standardize the numerical columns; the 0/1 flag is passed through unchanged.\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('numerical', StandardScaler(), numerical_features),\n",
"        ('categorical', 'passthrough', [categorical_feature]),\n",
"    ]\n",
")\n",
"# Seed the classifier as well: without random_state the boosting procedure is not\n",
"# reproducible between runs, even though the split and the search are seeded.\n",
"classifier = GradientBoostingClassifier(random_state=1337)\n",
"pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])\n",
"\n",
"# Candidate values the randomized search samples from (40 candidates, 5-fold CV).\n",
"param_distributions = {\n",
"    'classifier__learning_rate': np.logspace(-3, 0, num=20),\n",
"    'classifier__n_estimators': np.linspace(50, 300, num=20, dtype=int),\n",
"    'classifier__max_depth': np.linspace(3, 10, num=20, dtype=int),\n",
"}\n",
"search = RandomizedSearchCV(\n",
"    pipeline,\n",
"    param_distributions,\n",
"    n_iter=40,\n",
"    cv=5,\n",
"    verbose=0,\n",
"    random_state=1337,\n",
"    n_jobs=-1,\n",
")\n",
"search.fit(X_train, y_train)\n",
"\n",
"best_params = search.best_params_\n",
"print(f\"Best parameters: {best_params}\")\n",
"print(f\"Best score: {search.best_score_}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The final model is trained with the best hyperparameters found by the search and evaluated on the test set."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.45 0.18 0.25 102\n",
" 1 0.47 0.78 0.59 98\n",
"\n",
" accuracy 0.47 200\n",
" macro avg 0.46 0.48 0.42 200\n",
"weighted avg 0.46 0.47 0.42 200\n",
"\n"
]
}
],
"source": [
"# Apply the best hyperparameters, refit on the training split and predict the test split.\n",
"# set_params() and fit() both return the pipeline itself, so the calls can be chained.\n",
"y_pred = pipeline.set_params(**best_params).fit(X_train, y_train).predict(X_test)\n",
"print(classification_report(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Confusion matrix of the test predictions, normalized over the true labels (rows)\n",
"ConfusionMatrixDisplay.from_predictions(\n",
"    y_test,\n",
"    y_pred,\n",
"    normalize='true',\n",
"    cmap='Blues',\n",
"    colorbar=False,\n",
");"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With the model trained, we can also run various interpretability tests in case we wish to change the training procedure."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# The classifier sees the columns in the order the preprocessor emits them\n",
"# (transformer by transformer), which is not guaranteed to match X_train.columns.\n",
"# Reading the names from the fitted preprocessor keeps every importance attached\n",
"# to the right feature; the \"<transformer>__\" prefix is stripped for readability.\n",
"fitted_preprocessor = pipeline.named_steps['preprocessor']\n",
"feature_names = [name.split(\"__\", 1)[-1] for name in fitted_preprocessor.get_feature_names_out()]\n",
"\n",
"best_features = pd.Series(pipeline.named_steps['classifier'].feature_importances_, index=feature_names)\n",
"best_features /= best_features.sum()\n",
"best_features_cumulative = best_features.sort_values(ascending=False).cumsum()\n",
"best_features = best_features.sort_values()\n",
"\n",
"fig, ax = plt.subplots(1, 2, figsize=(20, 10))\n",
"\n",
"# Left panel: individual importances, most important feature at the top\n",
"best_features.plot(kind=\"barh\", ax=ax[0])\n",
"ax[0].set_title(\"Feature importance\")\n",
"ax[0].grid(axis=\"x\", which=\"both\", color=\"black\", linestyle=\"--\", linewidth=0.5)\n",
"ax[0].set_axisbelow(True)\n",
"ax[0].set_xlabel(\"Importance\")\n",
"ax[0].set_ylabel(\"Feature\")\n",
"\n",
"# Right panel: cumulative importance, features ordered from most to least important\n",
"ax[1].stem(best_features_cumulative.index, best_features_cumulative)\n",
"ax[1].set_title(\"Cumulative feature importance\")\n",
"ax[1].grid(axis=\"y\", which=\"both\", color=\"black\", linestyle=\"--\", linewidth=0.5)\n",
"ax[1].set_axisbelow(True)\n",
"ax[1].set_xlabel(\"Features\")\n",
"ax[1].set_ylabel(\"Cumulative importance\")\n",
"ax[1].set_xticks(\n",
"    rotation=\"vertical\",\n",
"    ticks=range(len(best_features_cumulative)),\n",
"    labels=best_features_cumulative.index,\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extra: Using MLflow\n",
"\n",
"Optionally, if you are using the JupyterLab interface included in the architecture ([localhost:8085](http://localhost:8085)), you can register the model above with MLflow experiments and runs, as presented below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"\n",
"import mlflow\n",
"\n",
"# Endpoint of the S3-compatible (MinIO) store used by MLflow.\n",
"os.environ[\"MLFLOW_S3_ENDPOINT_URL\"] = \"http://minio:8081\"\n",
"\n",
"# Credentials are not stored in the notebook: they are read from the environment\n",
"# and only requested interactively (without echo) when they are missing.\n",
"for _credential in (\"AWS_ACCESS_KEY_ID\", \"AWS_SECRET_ACCESS_KEY\"):\n",
"    if not os.environ.get(_credential):\n",
"        os.environ[_credential] = getpass.getpass(f\"{_credential}: \")\n",
"\n",
"mlflow.set_tracking_uri(\"http://mlflow:8083\")\n",
"mlflow.set_experiment(\"mlflow_tracking_model\")\n",
"mlflow.sklearn.autolog(\n",
"    log_model_signatures=True,\n",
"    log_input_examples=True,\n",
"    registered_model_name=\"clients_model\",\n",
")\n",
"\n",
"# Refit the pipeline inside a named run, with autologging enabled above.\n",
"with mlflow.start_run(run_name=\"autolog_pipe_model_reg\") as run:\n",
"    pipeline.fit(X_train, y_train)\n",
"    # ...\n",
"    # mlflow.sklearn.log_model...\n",
"    # mlflow.log_metric..."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}