data-driven-init/tabular_classification/digits.ipynb
2025-05-29 21:02:27 -05:00

18 KiB

Import libraries and define the data-loading helper

In [ ]:
import logging
# Silence Lightning's per-run INFO chatter (device placement, rank-zero notices)
# so only warnings and above reach the notebook output.
logging.getLogger("lightning.pytorch.utilities.rank_zero").setLevel(logging.WARNING)
logging.getLogger("lightning.pytorch.accelerators.cuda").setLevel(logging.WARNING)
from time import time
from pandas import DataFrame
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from torch import cuda
from lightning import Trainer
from lightning.pytorch.loggers import CSVLogger
from ideal_init.lightning_model import NNClassifier, merge_logs

# Define training parameters
BATCH_SIZE = 256          # samples per optimizer step
LEARNING_RATE = 1e-1      # SGD-scale learning rate used by NNClassifier
HIDDEN_SIZES = (10,)      # single hidden layer of 10 units
DIRECTORY = "./"          # root for Trainer logs and CSVLogger output
EPOCHS = 10               # training epochs per experiment
DATASET_NAME = "Handwritten digits"  # tag written into the merged log rows

def experiment_data(test_size=0.2, val_size=0.1, random_state=None):
    """Load the digits dataset and split it into train/val/test sets.

    With the default fractions the overall split is 70% train, 10%
    validation, 20% test — identical to the original hard-coded split.

    Args:
        test_size: fraction of the whole dataset held out for testing.
        val_size: fraction of the whole dataset held out for validation.
        random_state: optional seed forwarded to both splits for
            reproducibility; ``None`` keeps the original random behavior.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    X, y = load_digits(return_X_y=True)
    # First carve off the test set from the full dataset.
    X_rem, X_test, y_rem, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # val_size is a fraction of the WHOLE dataset, but the second split
    # only sees the remaining (1 - test_size) portion, so rescale:
    # e.g. 0.1 / 0.8 = 0.125, the original magic number.
    val_fraction = val_size / (1.0 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rem, y_rem, test_size=val_fraction, random_state=random_state
    )
    return X_train, y_train, X_val, y_val, X_test, y_test

Run experiments and measure performance

In [ ]:
# Keep a record of the results of all the iterations: one list entry per
# experiment for each metric/timing column.
values = ["IDEAL_acc_before_train", "IDEAL_acc_after_train", "IDEAL_init_time", "IDEAL_train_time", "He_acc_before_train", "He_acc_after_train", "He_init_time", "He_train_time"]
results = {value: [] for value in values}


def _build_trainer():
    """Return a fresh Trainer with the shared experiment configuration.

    Both the IDEAL-initialized and the He-initialized model must be trained
    under byte-identical Trainer settings for the comparison to be fair, so
    the configuration lives in exactly one place.
    """
    return Trainer(
        default_root_dir=DIRECTORY,
        accelerator="auto",
        devices="auto",
        max_epochs=EPOCHS,
        logger=CSVLogger(save_dir=DIRECTORY),
        enable_checkpointing=False,
        enable_progress_bar=False,
        enable_model_summary=False,
        val_check_interval=1,      # validate after every training batch
        log_every_n_steps=1,
        limit_val_batches=1,
        precision=64,
    )


# repeat experiment multiple times
NUM_EXPERIMENTS = 10
for experiment in range(NUM_EXPERIMENTS):

    print(f"Running experiment {experiment+1}")
    X_train, y_train, X_val, y_val, X_test, y_test = experiment_data()

    # Create models: identical architecture/hyperparameters, differing only
    # in whether the IDEAL data-driven initialization is applied.
    init_model = NNClassifier(X_train, y_train, X_val, y_val, X_test, y_test, initialize=True, hidden_sizes=HIDDEN_SIZES, learning_rate=LEARNING_RATE, batch_size=BATCH_SIZE)
    no_init_model = NNClassifier(X_train, y_train, X_val, y_val, X_test, y_test, initialize=False, hidden_sizes=HIDDEN_SIZES, learning_rate=LEARNING_RATE, batch_size=BATCH_SIZE)

    # Create trainers (one per model so their CSV logs stay separate).
    init_trainer = _build_trainer()
    no_init_trainer = _build_trainer()

    # Test models before training (accuracy straight after initialization).
    results["IDEAL_acc_before_train"].append(init_trainer.test(init_model, verbose=False)[0]["test_metric"])
    results["He_acc_before_train"].append(no_init_trainer.test(no_init_model, verbose=False)[0]["test_metric"])

    # Train models; the pre-fit validate() logs the epoch-0 metric so the
    # training curves start from the untrained state.
    init_time = time()
    init_trainer.validate(init_model)
    init_trainer.fit(init_model)
    init_time = time() - init_time
    init_model_logs = f"{init_trainer.logger.log_dir}/metrics.csv"

    no_init_time = time()
    no_init_trainer.validate(no_init_model)
    no_init_trainer.fit(no_init_model)
    no_init_time = time() - no_init_time
    no_init_model_logs = f"{no_init_trainer.logger.log_dir}/metrics.csv"

    # Merge the two per-model CSV logs into one comparison table.
    logs = merge_logs(init_model_logs, no_init_model_logs)
    logs["dataset"] = DATASET_NAME

    # Test models after training
    results["IDEAL_acc_after_train"].append(init_trainer.test(init_model, verbose=False)[0]["test_metric"])
    results["He_acc_after_train"].append(no_init_trainer.test(no_init_model, verbose=False)[0]["test_metric"])

    # Init and train times (init_time attribute is recorded by NNClassifier).
    results["IDEAL_init_time"].append(init_model.init_time)
    results["IDEAL_train_time"].append(init_time)
    results["He_init_time"].append(no_init_model.init_time)
    results["He_train_time"].append(no_init_time)

    # Store logs: append mode so all experiments accumulate in one file
    # (header intentionally omitted; quoting=1 quotes every field).
    with open("results.csv", "a", encoding="utf8") as results_file:
        logs.to_csv(results_file, header=False, index=False, quoting=1)

    # Clear cache: drop references so GPU memory can actually be released
    # before the next iteration allocates fresh models.
    del init_model, no_init_model
    del init_trainer, no_init_trainer
    del X_train, y_train, X_val, y_val, X_test, y_test
    del logs
    cuda.empty_cache()

# Aggregate the per-experiment lists into a table and summarize them.
results = DataFrame(results)
summary_mean = results.mean()
# Half-width of the 95% confidence interval (normal approximation).
summary_error = results.sem() * 1.96
print("Mean values:")
print(summary_mean.astype(str).str[:6])
print("Confidence error:")
print(summary_error.astype(str).str[:6])