In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/playing-the-stock-market-is-a-fools-game/sample_submission.csv
/kaggle/input/playing-the-stock-market-is-a-fools-game/train.csv

Preliminary¶

In [2]:
!pip install optuna
!pip install captum
Requirement already satisfied: optuna in /usr/local/lib/python3.10/dist-packages (4.2.1)
Requirement already satisfied: alembic>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from optuna) (1.14.1)
Requirement already satisfied: colorlog in /usr/local/lib/python3.10/dist-packages (from optuna) (6.9.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from optuna) (1.26.4)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from optuna) (24.2)
Requirement already satisfied: sqlalchemy>=1.4.2 in /usr/local/lib/python3.10/dist-packages (from optuna) (2.0.36)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from optuna) (4.67.1)
Requirement already satisfied: PyYAML in /usr/local/lib/python3.10/dist-packages (from optuna) (6.0.2)
Requirement already satisfied: Mako in /usr/local/lib/python3.10/dist-packages (from alembic>=1.5.0->optuna) (1.3.9)
Requirement already satisfied: typing-extensions>=4 in /usr/local/lib/python3.10/dist-packages (from alembic>=1.5.0->optuna) (4.12.2)
Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy>=1.4.2->optuna) (3.1.1)
Requirement already satisfied: mkl_fft in /usr/local/lib/python3.10/dist-packages (from numpy->optuna) (1.3.8)
Requirement already satisfied: mkl_random in /usr/local/lib/python3.10/dist-packages (from numpy->optuna) (1.2.4)
Requirement already satisfied: mkl_umath in /usr/local/lib/python3.10/dist-packages (from numpy->optuna) (0.1.1)
Requirement already satisfied: mkl in /usr/local/lib/python3.10/dist-packages (from numpy->optuna) (2025.0.1)
Requirement already satisfied: tbb4py in /usr/local/lib/python3.10/dist-packages (from numpy->optuna) (2022.0.0)
Requirement already satisfied: mkl-service in /usr/local/lib/python3.10/dist-packages (from numpy->optuna) (2.4.1)
Requirement already satisfied: MarkupSafe>=0.9.2 in /usr/local/lib/python3.10/dist-packages (from Mako->alembic>=1.5.0->optuna) (3.0.2)
Requirement already satisfied: intel-openmp>=2024 in /usr/local/lib/python3.10/dist-packages (from mkl->numpy->optuna) (2024.2.0)
Requirement already satisfied: tbb==2022.* in /usr/local/lib/python3.10/dist-packages (from mkl->numpy->optuna) (2022.0.0)
Requirement already satisfied: tcmlib==1.* in /usr/local/lib/python3.10/dist-packages (from tbb==2022.*->mkl->numpy->optuna) (1.2.0)
Requirement already satisfied: intel-cmplr-lib-rt in /usr/local/lib/python3.10/dist-packages (from mkl_umath->numpy->optuna) (2024.2.0)
Requirement already satisfied: intel-cmplr-lib-ur==2024.2.0 in /usr/local/lib/python3.10/dist-packages (from intel-openmp>=2024->mkl->numpy->optuna) (2024.2.0)
Collecting captum
  Downloading captum-0.8.0-py3-none-any.whl.metadata (26 kB)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from captum) (3.7.5)
Requirement already satisfied: numpy<2.0 in /usr/local/lib/python3.10/dist-packages (from captum) (1.26.4)
Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from captum) (24.2)
Requirement already satisfied: torch>=1.10 in /usr/local/lib/python3.10/dist-packages (from captum) (2.5.1+cu121)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from captum) (4.67.1)
Requirement already satisfied: mkl_fft in /usr/local/lib/python3.10/dist-packages (from numpy<2.0->captum) (1.3.8)
Requirement already satisfied: mkl_random in /usr/local/lib/python3.10/dist-packages (from numpy<2.0->captum) (1.2.4)
Requirement already satisfied: mkl_umath in /usr/local/lib/python3.10/dist-packages (from numpy<2.0->captum) (0.1.1)
Requirement already satisfied: mkl in /usr/local/lib/python3.10/dist-packages (from numpy<2.0->captum) (2025.0.1)
Requirement already satisfied: tbb4py in /usr/local/lib/python3.10/dist-packages (from numpy<2.0->captum) (2022.0.0)
Requirement already satisfied: mkl-service in /usr/local/lib/python3.10/dist-packages (from numpy<2.0->captum) (2.4.1)
Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.10->captum) (3.17.0)
Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10->captum) (4.12.2)
Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10->captum) (3.4.2)
Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10->captum) (3.1.4)
Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch>=1.10->captum) (2024.12.0)
Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10->captum) (1.13.1)
Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=1.10->captum) (1.3.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->captum) (1.3.1)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->captum) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->captum) (4.55.3)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->captum) (1.4.7)
Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->captum) (11.0.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->captum) (3.2.0)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->captum) (2.9.0.post0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->captum) (1.17.0)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10->captum) (3.0.2)
Requirement already satisfied: intel-openmp>=2024 in /usr/local/lib/python3.10/dist-packages (from mkl->numpy<2.0->captum) (2024.2.0)
Requirement already satisfied: tbb==2022.* in /usr/local/lib/python3.10/dist-packages (from mkl->numpy<2.0->captum) (2022.0.0)
Requirement already satisfied: tcmlib==1.* in /usr/local/lib/python3.10/dist-packages (from tbb==2022.*->mkl->numpy<2.0->captum) (1.2.0)
Requirement already satisfied: intel-cmplr-lib-rt in /usr/local/lib/python3.10/dist-packages (from mkl_umath->numpy<2.0->captum) (2024.2.0)
Requirement already satisfied: intel-cmplr-lib-ur==2024.2.0 in /usr/local/lib/python3.10/dist-packages (from intel-openmp>=2024->mkl->numpy<2.0->captum) (2024.2.0)
Downloading captum-0.8.0-py3-none-any.whl (1.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.4/1.4 MB 35.1 MB/s eta 0:00:00
Installing collected packages: captum
Successfully installed captum-0.8.0

Library¶

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_ as clip

import optuna
from optuna.visualization import plot_param_importances

from captum.attr import IntegratedGradients

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

1. Preprocess¶

Preprocessing is a crucial stage because it helps us understand the dataset. This section explores, cleans, and transforms the data to prepare it for further analysis. First, we import the data provided by the Kaggle competition and reformat it so that rows represent dates ("Date") and columns represent companies. We also check for missing values and summarize the first 10 companies.

The chart below displays the daily returns for 442 companies from 05/04/2010 to 31/03/2022, indicating that most fluctuate within a band of approximately -20% to +20%. Such bounded variability suggests these returns are likely stationary, with no clear trend over time. We also observe a negative shock in the first quarter of 2020, consistent with the effect of the COVID-19 pandemic.

In this project, I use MinMaxScaler for normalization to mitigate the effect of extreme values, since the data contain true zero values. Afterwards, I split the dataset chronologically into an in-sample (training) set and an out-of-sample (test) set, using 80% of the days for training and 20% for testing, separated by a threshold date; rolling windows are then built within each set (Section 2).
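
As a quick illustration (a toy sketch, not part of the pipeline), MinMaxScaler rescales each column to [0, 1] via x' = (x - min) / (max - min), and the same fitted scaler can later invert predictions back to the original percentage scale, which is what Section 7 relies on:

# Toy example: one hypothetical column of daily returns (in %)
toy = np.array([[-20.0], [0.0], [5.0], [20.0]])

scaler_demo = MinMaxScaler()                       # x' = (x - min) / (max - min)
scaled = scaler_demo.fit_transform(toy)            # [[0.0], [0.5], [0.625], [1.0]]
restored = scaler_demo.inverse_transform(scaled)   # back to the original % scale
print(scaled.ravel(), restored.ravel())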

In [4]:
# | --------------------------|
# | 1.1 Import data           |
# | --------------------------|
train = pd.read_csv('/kaggle/input/playing-the-stock-market-is-a-fools-game/train.csv', 
                    index_col='ID')

print('Overview:\n'+'-'*30)
print(f'  train.shape: \t{train.shape}')
print(f'  Total companies: \t{len(train.index.unique())}')
print('='*30)

# | --------------------------|
# | 1.2 Format data           |
# | --------------------------|
train = train.T
train.index.name = 'Date'
train.index = pd.to_datetime(train.index, format = '%d/%m/%Y')

print('\nTotal NAs:\n'+'-'*30+f'\n{train.isna().sum()}')
print('='*30)

# | --------------------------|
# | 1.3 Summary data          |
# | --------------------------|
summary_stats = train.describe()
print("\nSummary Statistics for First 10 Companies\n"+'-'*70)
print(summary_stats.iloc[:, :10])
print('='*70)

# | --------------------------|
# | 1.4 Visualization         |
# | --------------------------|
plt.figure(figsize=(15, 8))
for company in train.columns:
    plt.plot(train.index, train[company], alpha=0.5)
plt.xlabel("Date")
plt.ylabel("Daily percentage change")
plt.title("Return of 442 companies from 05/04/2010 to 31/03/2022")
plt.tight_layout()
plt.show()

# | --------------------------|
# | 1.5 Normalization         |
# | --------------------------|
scaler = MinMaxScaler()
data = scaler.fit_transform(train.values)

# | --------------------------|
# | 1.6 Split ratio           |
# | --------------------------|
thres_date = int(len(data) * 0.8)
train_set = data[:thres_date, :]
test_set = data[thres_date:, :]

print("\nData includes:\n"+'-'*30)
print(f'Train shape: \t{train_set.shape}')
print(f'Test shape: \t{test_set.shape}')
print('='*30)
Overview:
------------------------------
  train.shape: 	(442, 3021)
  Total companies: 	442
==============================

Total NAs:
------------------------------
ID
company_0      0
company_1      0
company_2      0
company_3      0
company_4      0
              ..
company_437    0
company_438    0
company_439    0
company_440    0
company_441    0
Length: 442, dtype: int64
==============================

Summary Statistics for First 10 Companies
----------------------------------------------------------------------
ID       company_0    company_1    company_2    company_3    company_4  \
count  3021.000000  3021.000000  3021.000000  3021.000000  3021.000000   
mean      0.021781    -0.019361     0.029437     0.048183     0.043363   
std       1.695075     3.130242     1.755380     1.984768     1.853193   
min     -19.350000   -43.450000   -15.030000   -19.270000   -25.860000   
25%      -0.660000    -1.360000    -0.780000    -0.830000    -0.800000   
50%       0.050000     0.030000     0.080000     0.110000     0.060000   
75%       0.800000     1.400000     0.900000     1.000000     0.940000   
max      13.760000    20.450000    15.370000    15.050000    17.410000   

ID       company_5    company_6    company_7    company_8    company_9  
count  3021.000000  3021.000000  3021.000000  3021.000000  3021.000000  
mean     -0.025650    -0.039649     0.032939    -0.034052     0.014386  
std       2.638759     3.042905     2.122935     3.312811     2.277523  
min     -41.380000   -59.740000   -19.130000   -88.150000   -31.320000  
25%      -1.060000    -1.360000    -0.920000    -1.390000    -0.880000  
50%       0.080000     0.000000     0.080000     0.030000     0.070000  
75%       1.120000     1.360000     1.110000     1.450000     1.000000  
max      22.890000    17.400000    13.460000    18.860000    19.560000  
======================================================================
[Figure: Return of 442 companies from 05/04/2010 to 31/03/2022]
Data includes:
------------------------------
Train shape: 	(2416, 442)
Test shape: 	(605, 442)
==============================

2. Data loader¶

I created a rolling-windows function and wrapped it in a custom Dataset class that serves the time series as tensors; a quick shape check follows the cell below. Early stopping is used during training to prevent overfitting and reduce computation time.

In [5]:
# | --------------------------|
# | 2.1 Rolling windows       |
# | --------------------------|
def rolling_windows(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i : (i + seq_length)])
        y.append(data[i + seq_length])
    return np.array(X), np.array(y)
    
# | --------------------------|
# | 2.2 Custom dataset        |
# | --------------------------|
class TimeSeries(Dataset):
    def __init__(self, data, seq_length):
        self.X, self.y = rolling_windows(data, seq_length)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return (torch.tensor(self.X[idx], dtype=torch.float32),
                torch.tensor(self.y[idx], dtype=torch.float32))

# | --------------------------|
# | 2.3 Early stopping        |
# | --------------------------|
class EarlyStopping:
    def __init__(self, patience=20, min_delta=1e-4):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = None
        self.counter = 0

    def step(self, current_loss):
        if self.best_loss is None or current_loss < (self.best_loss - self.min_delta):
            self.best_loss = current_loss
            self.counter = 0
            return False  # Don't stop
        else:
            self.counter += 1
            return self.counter >= self.patience
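
As a sanity check (a small sketch, not part of the pipeline; random numbers stand in for train_set), the rolling windows and dataset yield the shapes below. With 2416 training days and seq_length = 24 we expect 2416 - 24 = 2392 windows, which matches the "2392 samples" reported in the final training cell:

# Illustrative shape check for the rolling-window dataset
_demo = np.random.rand(2416, 442)          # stand-in for train_set
_demo_ds = TimeSeries(_demo, seq_length=24)
_X0, _y0 = _demo_ds[0]
print(len(_demo_ds))   # 2392 windows
print(_X0.shape)       # torch.Size([24, 442])  -> (seq_length, n_companies)
print(_y0.shape)       # torch.Size([442])      -> next-day return for every company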

3. Model architecture¶

This is my final LSTM model, built with the best parameters from Optuna's hyperparameter tuning, which I describe in the next part. Basically, it is a simple LSTM with a configurable number of layers and hidden dimensions, plus a dropout layer for deep LSTMs (more than one layer). With a single layer the dropout ratio is set to 0, so dropout does not apply in that case. Some hyperparameters are fixed at the values I believe are best (see the commented-out trial suggestions).

After tuning the hyperparameters, I found that the LSTM model with 2 layers performed better than other configurations. Therefore, I chose to use 2 layers in the final model setup.

In [6]:
def my_LSTM(best_params, input_dim, output_dim):
    num_layers = 2 # best_params["num_layers"]
    hidden_dim = best_params["hidden_dim"]
    dropout = best_params["dropout"]

    class LSTM(nn.Module):
        def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
            super(LSTM, self).__init__()
            self.input_dim = input_dim
            self.hidden_dim = hidden_dim
            self.num_layers = num_layers

            self.rnn = nn.LSTM(
                input_size=input_dim,
                hidden_size=hidden_dim,
                num_layers=num_layers,
                batch_first=False,
                dropout = dropout if num_layers > 1 else 0.0
            )
            self.linear = nn.Linear(hidden_dim, output_dim)

        def init_hidden(self, batch_size, device):
            return (torch.zeros(self.num_layers, 
                                batch_size, 
                                self.hidden_dim, 
                                device = device),
                    torch.zeros(self.num_layers, 
                                batch_size, 
                                self.hidden_dim, 
                                device = device))

        def forward(self, x):
            batch_size = x.size(1)
            hidden = self.init_hidden(batch_size, x.device)
            y, hidden = self.rnn(x, hidden)
            y = self.linear(y)
            return y, hidden

    model = LSTM(input_dim, hidden_dim, output_dim, num_layers, dropout)
    return model
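
Because the LSTM is built with batch_first=False, it expects input shaped (seq_length, batch_size, input_dim); this is why the training loop below transposes each DataLoader batch. A minimal shape sketch, using placeholder hyperparameter values (the real ones come from the Optuna study):

# Shape-convention sketch (placeholder hyperparameters, zeros as dummy input)
_params = {"hidden_dim": 311, "dropout": 0.16}
_m = my_LSTM(_params, input_dim=442, output_dim=442)
_x = torch.zeros(24, 64, 442)              # (seq_length, batch_size, n_companies)
_y, (_h, _c) = _m(_x)
print(_y.shape)   # torch.Size([24, 64, 442]); _y[-1] is the next-day prediction
print(_h.shape)   # torch.Size([2, 64, 311]) -> (num_layers, batch_size, hidden_dim)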

4. Hyperparameters tuning with Optuna¶

Regarding the model structure, the following hyperparameters are tuned:

  • Number of layers
  • Number of features in hidden state
  • Dropout layer

In terms of the objective function and window size:

  • Sequence length
  • Weight decay (L2)
  • Gradient clipping

Some fixed parameters I used:

  • Batch size (no shuffling, given the time-series nature of the data)
  • Early stop
  • Epoch
  • Optimizer is Adam

I also add Gaussian noise ~ N(0, 0.0001) (i.e., standard deviation 0.01) to the inputs during training.

In [7]:
def define_lstm_model(trial, input_dim, output_dim):
    num_layers = 2 # trial.suggest_int("num_layers", 1, 3)
    hidden_dim = trial.suggest_int("hidden_dim", 300, 350)
    dropout = trial.suggest_float("dropout", 0.15, 0.18)

    class LSTM(nn.Module):
        def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
            super(LSTM, self).__init__()
            self.rnn = nn.LSTM(
                input_size=input_dim,
                hidden_size=hidden_dim,
                num_layers=num_layers,
                batch_first=False,
                dropout = dropout if num_layers > 1 else 0.0
            )
            self.linear = nn.Linear(hidden_dim, output_dim)

        def forward(self, x):
            out, _ = self.rnn(x)
            out = self.linear(out) 
            return out

    return LSTM(input_dim, hidden_dim, output_dim, num_layers, dropout)
In [8]:
def objective(trial):
# | --------------------------|
# | 4.1 Hyperparameters       |
# | --------------------------|
    seq_length = 24 # trial.suggest_int("seq_length", 23, 31)
    
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    clip_value = trial.suggest_float("clip_value", 1.5, 2.0, log=True)

    batch_size = 64
    early_stopper = EarlyStopping(patience=20, min_delta=1e-4)

# | --------------------------|
# | 4.2 Define the model      |
# | --------------------------|
    input_dim = train_set.shape[1]  # 442 companies
    output_dim = train_set.shape[1]
    model = define_lstm_model(trial, input_dim, output_dim).to(DEVICE)

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.MSELoss()
    EPOCHS = 100  

# | --------------------------|
# | 4.3 Window for train      |
# | --------------------------|
    train_dataset = TimeSeries(train_set, seq_length)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

# | --------------------------|
# | 4.4 Training loops        |
# | --------------------------|
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0.0
        for X_batch, y_batch in train_loader:
            # X_batch: (batch_size, seq_length, input_dim)
            # Transpose to (seq_length, batch_size, input_dim)
            X_batch = X_batch.transpose(0, 1).to(DEVICE)
            y_batch = y_batch.to(DEVICE)
        
            optimizer.zero_grad()

            # Add white noise epsilon ~ N(0,0.0001)
            X_noisy = X_batch + torch.randn_like(X_batch) * 0.01
        
            outputs = model(X_noisy)
      
            # Compare final time step to y
            loss = criterion(outputs[-1], y_batch)

            loss.backward()
        
            # Gradient clipping
            clip(model.parameters(), clip_value)

            optimizer.step()
            epoch_loss += loss.item()
        avg_loss = epoch_loss / len(train_loader)
        trial.report(avg_loss, epoch)
        if early_stopper.step(avg_loss):
            print(f"Early stopping at epoch {epoch+1}")
            break
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

# | --------------------------|
# | 4.5 Evaluate on test set  |
# | --------------------------|
    test_dataset = TimeSeries(test_set, seq_length)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.transpose(0, 1).to(DEVICE)
            y_batch = y_batch.to(DEVICE)
            outputs = model(X_batch)
            loss = criterion(outputs[-1], y_batch)
            total_loss += loss.item()
    avg_test_loss = total_loss / len(test_loader)
    return avg_test_loss

I used many trials because several hyperparameters are tuned at once. The early stopper saves computational time where possible.
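
One detail worth noting: trial.report(...) together with trial.should_prune() only prunes when the study has a pruner, and optuna.create_study falls back to a MedianPruner by default, which is what produces the "Trial N pruned." lines in the log below. To make the sampler and pruner explicit, an equivalent setup would look like this sketch (the actual cell simply relies on Optuna's defaults):

# Explicit sampler/pruner configuration (sketch only; the next cell uses the defaults)
study_explicit = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),           # reproducible parameter suggestions
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),  # start pruning after 5 reported epochs
)
# study_explicit.optimize(objective, n_trials=50, timeout=600)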

In [9]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, timeout=600)

print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)
print("Best value (MSE):", study.best_trial.value)
[I 2025-04-11 14:00:30,504] A new study created in memory with name: no-name-5c359c16-4e72-4fa2-8b1c-da00e9e45394
[I 2025-04-11 14:00:50,465] Trial 0 finished with value: 0.005437457200605423 and parameters: {'weight_decay': 9.010696366541017e-05, 'lr': 3.2631921523822867e-05, 'clip_value': 1.866483325115156, 'hidden_dim': 329, 'dropout': 0.1754962395750958}. Best is trial 0 with value: 0.005437457200605423.
Early stopping at epoch 42
[I 2025-04-11 14:01:02,646] Trial 1 finished with value: 0.00541713695274666 and parameters: {'weight_decay': 1.6820759117508032e-06, 'lr': 0.00033567838742119805, 'clip_value': 1.8276086428463114, 'hidden_dim': 348, 'dropout': 0.15032483610439398}. Best is trial 1 with value: 0.00541713695274666.
Early stopping at epoch 30
[I 2025-04-11 14:01:15,396] Trial 2 finished with value: 0.005414817866403609 and parameters: {'weight_decay': 1.913899886289534e-06, 'lr': 7.877108245309796e-05, 'clip_value': 1.8749800376757997, 'hidden_dim': 311, 'dropout': 0.1556428663624871}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 33
[I 2025-04-11 14:01:28,776] Trial 3 finished with value: 0.005433881666976959 and parameters: {'weight_decay': 3.0946875752567993e-06, 'lr': 1.1342870077167936e-05, 'clip_value': 1.8135418545573978, 'hidden_dim': 303, 'dropout': 0.16711117775015494}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 35
[I 2025-04-11 14:01:40,502] Trial 4 finished with value: 0.005442619521636516 and parameters: {'weight_decay': 1.006123553176289e-05, 'lr': 0.0009483162432304677, 'clip_value': 1.6389921555239382, 'hidden_dim': 329, 'dropout': 0.16492601675200816}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 29
[I 2025-04-11 14:01:40,958] Trial 5 pruned. 
[I 2025-04-11 14:01:51,021] Trial 6 finished with value: 0.005460354976821691 and parameters: {'weight_decay': 8.382687383884459e-05, 'lr': 0.0004026499915548599, 'clip_value': 1.715257939513186, 'hidden_dim': 309, 'dropout': 0.17611276292926648}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 26
[I 2025-04-11 14:01:54,606] Trial 7 pruned. 
[I 2025-04-11 14:01:55,045] Trial 8 pruned. 
[I 2025-04-11 14:01:55,494] Trial 9 pruned. 
[I 2025-04-11 14:01:57,190] Trial 10 pruned. 
[I 2025-04-11 14:02:09,374] Trial 11 finished with value: 0.0054453269694931805 and parameters: {'weight_decay': 3.0016444449495126e-05, 'lr': 0.00030132863816897135, 'clip_value': 1.9587820715261755, 'hidden_dim': 348, 'dropout': 0.1503370159585747}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 30
[I 2025-04-11 14:02:09,861] Trial 12 pruned. 
[I 2025-04-11 14:02:21,940] Trial 13 finished with value: 0.005440033378545195 and parameters: {'weight_decay': 1.8626673272768084e-05, 'lr': 0.0007836118097393293, 'clip_value': 1.7940476517086086, 'hidden_dim': 315, 'dropout': 0.16100015593572392}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 31
[I 2025-04-11 14:02:22,438] Trial 14 pruned. 
[I 2025-04-11 14:02:22,913] Trial 15 pruned. 
[I 2025-04-11 14:02:23,411] Trial 16 pruned. 
[I 2025-04-11 14:02:24,647] Trial 17 pruned. 
[I 2025-04-11 14:02:25,136] Trial 18 pruned. 
[I 2025-04-11 14:02:25,627] Trial 19 pruned. 
[I 2025-04-11 14:02:26,082] Trial 20 pruned. 
[I 2025-04-11 14:02:26,536] Trial 21 pruned. 
[I 2025-04-11 14:02:27,001] Trial 22 pruned. 
[I 2025-04-11 14:02:27,472] Trial 23 pruned. 
[I 2025-04-11 14:02:27,956] Trial 24 pruned. 
[I 2025-04-11 14:02:28,449] Trial 25 pruned. 
[I 2025-04-11 14:02:28,916] Trial 26 pruned. 
[I 2025-04-11 14:02:40,335] Trial 27 finished with value: 0.0054313832661136985 and parameters: {'weight_decay': 4.012516584798842e-06, 'lr': 0.0006195392971702806, 'clip_value': 1.6569226926301694, 'hidden_dim': 303, 'dropout': 0.1527784251870268}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 30
[I 2025-04-11 14:02:52,435] Trial 28 finished with value: 0.005447977152653038 and parameters: {'weight_decay': 4.424736427518117e-06, 'lr': 0.0006145175066730719, 'clip_value': 1.6563144546147344, 'hidden_dim': 313, 'dropout': 0.15372055261308065}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 31
[I 2025-04-11 14:02:53,678] Trial 29 pruned. 
[I 2025-04-11 14:02:54,170] Trial 30 pruned. 
[I 2025-04-11 14:03:05,644] Trial 31 finished with value: 0.005459379591047764 and parameters: {'weight_decay': 7.437705227276176e-06, 'lr': 0.0006314143097492449, 'clip_value': 1.6087132706826734, 'hidden_dim': 303, 'dropout': 0.15503312609732145}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 30
[I 2025-04-11 14:03:17,516] Trial 32 finished with value: 0.00545856540556997 and parameters: {'weight_decay': 3.65541440057519e-06, 'lr': 0.0008642340947939282, 'clip_value': 1.736418574215152, 'hidden_dim': 308, 'dropout': 0.16717370398748027}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 31
[I 2025-04-11 14:03:18,000] Trial 33 pruned. 
[I 2025-04-11 14:03:18,462] Trial 34 pruned. 
[I 2025-04-11 14:03:30,108] Trial 35 finished with value: 0.00546645465074107 and parameters: {'weight_decay': 1.8229481749002745e-06, 'lr': 0.000997039294911953, 'clip_value': 1.7338512746237096, 'hidden_dim': 309, 'dropout': 0.17278615138964415}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 30
[I 2025-04-11 14:03:30,617] Trial 36 pruned. 
[I 2025-04-11 14:03:31,083] Trial 37 pruned. 
[I 2025-04-11 14:03:31,548] Trial 38 pruned. 
[I 2025-04-11 14:03:32,032] Trial 39 pruned. 
[I 2025-04-11 14:03:32,503] Trial 40 pruned. 
[I 2025-04-11 14:03:33,026] Trial 41 pruned. 
[I 2025-04-11 14:03:33,535] Trial 42 pruned. 
[I 2025-04-11 14:03:34,024] Trial 43 pruned. 
[I 2025-04-11 14:03:34,498] Trial 44 pruned. 
[I 2025-04-11 14:03:34,962] Trial 45 pruned. 
[I 2025-04-11 14:03:35,432] Trial 46 pruned. 
[I 2025-04-11 14:03:35,935] Trial 47 pruned. 
[I 2025-04-11 14:03:36,430] Trial 48 pruned. 
[I 2025-04-11 14:03:44,356] Trial 49 pruned. 
Number of finished trials: 50
Best trial: {'weight_decay': 1.913899886289534e-06, 'lr': 7.877108245309796e-05, 'clip_value': 1.8749800376757997, 'hidden_dim': 311, 'dropout': 0.1556428663624871}
Best value (MSE): 0.005414817866403609

This step reports the importance of each hyperparameter. The top hyperparameters in the chart motivate narrowing their search ranges, because the chart shows how strongly each hyperparameter affects the objective.

In [10]:
fig = plot_param_importances(study)
fig.show()

5. Training phase¶

Using the best parameters from Optuna, we train the final model on the training data.

In [11]:
# | --------------------------|
# | 5.1 Define hyperparameters|
# | --------------------------|
best_params = study.best_trial.params
seq_length = 24 # best_params["seq_length"]

input_dim = train_set.shape[1]
output_dim = train_set.shape[1]

# Build final model using my_LSTM
final_model = my_LSTM(best_params, input_dim, output_dim).to(DEVICE)

lr = best_params["lr"]
weight_decay = best_params["weight_decay"]
clip_value = best_params["clip_value"]

final_optimizer = optim.Adam(final_model.parameters(), lr=lr, weight_decay=weight_decay)
criterion = nn.MSELoss()
EPOCHS_FINAL = 1000
batch_size = 64
early_stopper_final = EarlyStopping(patience=20, min_delta=1e-4)

# | --------------------------|
# | 5.2 Full training data    |
# | --------------------------|
train_dataset_full = TimeSeries(train_set, seq_length)
train_loader_full = DataLoader(train_dataset_full, batch_size=batch_size, shuffle=False)

print(f"Final Training Dataset: {len(train_dataset_full)} samples")

# | --------------------------|
# | 5.3 Training loops        |
# | --------------------------|
for epoch in range(EPOCHS_FINAL):
    final_model.train()
    epoch_loss = 0.0
    for X_batch, y_batch in train_loader_full:
        X_batch = X_batch.transpose(0, 1).to(DEVICE)
        y_batch = y_batch.to(DEVICE)

        final_optimizer.zero_grad()
        X_noisy = X_batch + torch.randn_like(X_batch) * 0.01

        outputs, _ = final_model(X_noisy)  
        loss = criterion(outputs[-1], y_batch)
        loss.backward()
    
        # Gradient clipping
        clip(final_model.parameters(), clip_value)
    
        final_optimizer.step()
        epoch_loss += loss.item()
    avg_loss = epoch_loss / len(train_loader_full)
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1}/{EPOCHS_FINAL} => Loss: {avg_loss:.6f}")
        
    if early_stopper_final.step(avg_loss):
        print(f"Early stopping at epoch {epoch+1}")
        print(f"Epoch {epoch+1}/{EPOCHS_FINAL} => Loss: {avg_loss:.6f}")
        break
Final Training Dataset: 2392 samples
Early stopping at epoch 34
Epoch 34/1000 => Loss: 0.002531

6. Evaluate on test set¶

Using the trained model, we evaluate it on the test set.

In [12]:
# | --------------------------|
# | 6.1 Full test data        |
# | --------------------------|
test_dataset_final = TimeSeries(test_set, seq_length)
test_loader_final = DataLoader(test_dataset_final, batch_size=batch_size, shuffle=False)

# | --------------------------|
# | 6.2 Evaluate on test set  |
# | --------------------------|
final_model.eval()
total_test_loss = 0.0
with torch.no_grad():
    for X_batch, y_batch in test_loader_final:
        X_batch = X_batch.transpose(0, 1).to(DEVICE)
        y_batch = y_batch.to(DEVICE)
        y_test_pred, _ = final_model(X_batch)
        loss = criterion(y_test_pred[-1], y_batch)
        total_test_loss += loss.item()
avg_test_loss = total_test_loss / len(test_loader_final)
print("Final Test MSE:", avg_test_loss)
Final Test MSE: 0.005414343986194581

7. Forecasting¶

In [13]:
# | --------------------------|
# | 7.1 Define last sequence  |
# | --------------------------|
last_window = test_set[-seq_length:]
print("Last window shape:", last_window.shape)  
last_window_t = torch.tensor(last_window, dtype=torch.float32).unsqueeze(1).to(DEVICE)

final_model.eval()
with torch.no_grad():
    out_seq, _ = final_model(last_window_t)
    next_day_pred_norm = out_seq[-1, 0].cpu().numpy()

# | --------------------------|
# | 7.2 Inverse to real values|
# | --------------------------|
next_day_pred = scaler.inverse_transform(next_day_pred_norm.reshape(1, -1)).flatten()
Last window shape: (24, 442)

8. Interpretation with Captum¶

This step takes the LSTM's output at the final time step and applies Integrated Gradients to assess each feature's contribution relative to a zero baseline, using the last window of data to identify which lags most influenced the forecast. The chart below displays the average Integrated Gradients attribution for each time step in the final window: negative values suggest that the corresponding time steps push the prediction down, while positive values indicate a contribution with the same sign as the output.
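
For reference, Integrated Gradients attributes the i-th input feature by integrating the gradient of the model output F along the straight-line path from the baseline x' (here all zeros) to the input x:

\mathrm{IG}_i(x) = (x_i - x'_i)\int_{0}^{1} \frac{\partial F\bigl(x' + \alpha\,(x - x')\bigr)}{\partial x_i}\, d\alpha

Captum approximates this integral with a finite sum; the returned delta reports the convergence error, i.e. the gap between the summed attributions and F(x) - F(x').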

In [14]:
# | --------------------------|
# | 8.1 Define forward pass   |
# | --------------------------|
class CaptumModel(nn.Module):
    def __init__(self, lstm_model, target_index=1):
        super().__init__()
        self.lstm_model = lstm_model
        self.target_index = target_index

    def forward(self, x):
        outputs, _ = self.lstm_model(x)    
        last_output = outputs[-1]         
        return last_output[:, self.target_index]
        
md = CaptumModel(final_model, target_index=1).to(DEVICE)
last_window_t.requires_grad_()
md.train()

# | --------------------------|
# | 8.2 Integrated Gradients  |
# | --------------------------|
ig = IntegratedGradients(md)
baseline = torch.zeros_like(last_window_t).to(DEVICE)

# | --------------------------|
# | 8.3 Compute attributions  |
# | --------------------------|
attributions, delta = ig.attribute(last_window_t, 
                                   baseline, 
                                   return_convergence_delta=True)
attr = attributions.squeeze(1).detach().cpu().numpy()
avg_attr_per_timestep = attr.mean(axis=1)

print("Average attributions per time step:\n", avg_attr_per_timestep)

# | --------------------------|
# | 8.4 Plot time-step        |
# | --------------------------|
plt.figure(figsize=(12, 6))
plt.plot(range(len(avg_attr_per_timestep)), avg_attr_per_timestep, marker='o')
plt.xlabel("Time Step")
plt.ylabel("Average Attribution")
plt.title("Integrated Gradients Attributions over the Last Window (Time Steps)")
plt.grid(True)
plt.show()
Average attributions per time step:
 [8.4618076e-11 9.7116891e-11 1.3332961e-10 1.6506448e-10 2.1382948e-10
 2.5399199e-10 3.3404243e-10 4.1872006e-10 5.5158889e-10 7.0463962e-10
 9.3649077e-10 1.2263686e-09 1.6076981e-09 1.9979736e-09 2.6368880e-09
 3.4501841e-09 4.6910014e-09 6.3431669e-09 8.2293745e-09 1.0982817e-08
 1.3993827e-08 2.0980094e-08 2.9321937e-08 3.3955605e-08]
[Figure: Integrated Gradients attributions over the last window (time steps)]

9. Submission¶

In [15]:
temp = pd.read_csv("/kaggle/input/playing-the-stock-market-is-a-fools-game/sample_submission.csv")
temp["value"] = next_day_pred
temp.to_csv("submission.csv", index=False)
print("Done")
Done