# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/playing-the-stock-market-is-a-fools-game/sample_submission.csv
/kaggle/input/playing-the-stock-market-is-a-fools-game/train.csv
Preliminary¶
!pip install optuna
!pip install captum
Requirement already satisfied: optuna in /usr/local/lib/python3.10/dist-packages (4.2.1)
Collecting captum
  Downloading captum-0.8.0-py3-none-any.whl (1.4 MB)
Installing collected packages: captum
Successfully installed captum-0.8.0
Library¶
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_ as clip
import optuna
from optuna.visualization import plot_param_importances
from captum.attr import IntegratedGradients
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1. Preprocess¶
Preprocessing is a crucial stage because it helps us understand the dataset. This section also implements steps to explore, clean, and transform the data for further analysis. First, we import the data provided by the Kaggle competition and reformat it so that rows correspond to "Date" and columns to "Companies". We also check for missing values and summarize the first 10 companies.
The chart below displays the daily returns of 442 companies from 05/04/2010 to 31/03/2022, indicating that most fluctuate within a band of approximately -20% to +20%. Such bounded variability suggests these returns are likely stationary, with no clear trend over time. We also observe a negative shock in the first quarter of 2020, reflecting the effect of the COVID-19 pandemic.
In this project, I use MinMaxScaler for normalization to mitigate the effect of extreme values while preserving the true zero values in the data. Afterwards, I split the dataset into an in-sample training set and an out-of-sample test set, with 80% for training and 20% for testing, separated by a threshold date.
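As a quick illustration of what MinMaxScaler does (a toy sketch only, not part of the pipeline): each column is mapped to [0, 1] via (x - col_min) / (col_max - col_min), with hypothetical values below.
# Sketch: column-wise min-max scaling on a toy 3-day x 2-company array.
import numpy as np
from sklearn.preprocessing import MinMaxScaler
toy = np.array([[-2.0, 1.0],
                [ 0.0, 3.0],
                [ 2.0, 5.0]])               # hypothetical returns
toy_scaled = MinMaxScaler().fit_transform(toy)
print(toy_scaled)                            # [[0. 0.], [0.5 0.5], [1. 1.]]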
# | --------------------------|
# | 1.1 Import data |
# | --------------------------|
train = pd.read_csv('/kaggle/input/playing-the-stock-market-is-a-fools-game/train.csv',
index_col='ID')
print('Overview:\n'+'-'*30)
print(f' train.shape: \t{train.shape}')
print(f' Total companies: \t{len(train.index.unique())}')
print('='*30)
# | --------------------------|
# | 1.2 Format data |
# | --------------------------|
train = train.T
train.index.name = 'Date'
train.index = pd.to_datetime(train.index, format = '%d/%m/%Y')
print('\nTotal NAs:\n'+'-'*30+f'\n{train.isna().sum()}')
print('='*30)
# | --------------------------|
# | 1.3 Summary data |
# | --------------------------|
summary_stats = train.describe()
print("\nSummary Statistics for First 10 Companies\n"+'-'*70)
print(summary_stats.iloc[:, :10])
print('='*70)
# | --------------------------|
# | 1.4 Visualization |
# | --------------------------|
plt.figure(figsize=(15, 8))
for company in train.columns:
plt.plot(train.index, train[company], alpha=0.5)
plt.xlabel("Date")
plt.ylabel("Daily percentage change")
plt.title("Return of 442 companies from 05/04/2010 to 31/03/2022")
plt.tight_layout()
plt.show()
# | --------------------------|
# | 1.5 Normalization |
# | --------------------------|
scaler = MinMaxScaler()
data = scaler.fit_transform(train.values)
# | --------------------------|
# | 1.6 Split ratio |
# | --------------------------|
thres_date = int(len(data) * 0.8)
train_set = data[:thres_date, :]
test_set = data[thres_date:, :]
print("\nData includes:\n"+'-'*30)
print(f'Train shape: \t{train_set.shape}')
print(f'Test shape: \t{test_set.shape}')
print('='*30)
Overview:
------------------------------
 train.shape: 	(442, 3021)
 Total companies: 	442
==============================

Total NAs:
------------------------------
ID
company_0      0
company_1      0
company_2      0
company_3      0
company_4      0
              ..
company_437    0
company_438    0
company_439    0
company_440    0
company_441    0
Length: 442, dtype: int64
==============================

Summary Statistics for First 10 Companies
----------------------------------------------------------------------
ID       company_0    company_1    company_2    company_3    company_4
count  3021.000000  3021.000000  3021.000000  3021.000000  3021.000000
mean      0.021781    -0.019361     0.029437     0.048183     0.043363
std       1.695075     3.130242     1.755380     1.984768     1.853193
min     -19.350000   -43.450000   -15.030000   -19.270000   -25.860000
25%      -0.660000    -1.360000    -0.780000    -0.830000    -0.800000
50%       0.050000     0.030000     0.080000     0.110000     0.060000
75%       0.800000     1.400000     0.900000     1.000000     0.940000
max      13.760000    20.450000    15.370000    15.050000    17.410000

ID       company_5    company_6    company_7    company_8    company_9
count  3021.000000  3021.000000  3021.000000  3021.000000  3021.000000
mean     -0.025650    -0.039649     0.032939    -0.034052     0.014386
std       2.638759     3.042905     2.122935     3.312811     2.277523
min     -41.380000   -59.740000   -19.130000   -88.150000   -31.320000
25%      -1.060000    -1.360000    -0.920000    -1.390000    -0.880000
50%       0.080000     0.000000     0.080000     0.030000     0.070000
75%       1.120000     1.360000     1.110000     1.450000     1.000000
max      22.890000    17.400000    13.460000    18.860000    19.560000
======================================================================
Data includes:
------------------------------
Train shape: 	(2416, 442)
Test shape: 	(605, 442)
==============================
2. Data loader¶
I created a rolling-window function and used it inside a custom time-series Dataset that returns tensors. Early stopping is used during training to prevent overfitting and reduce computation time.
# | --------------------------|
# | 2.1 Rolling windows |
# | --------------------------|
def rolling_windows(data, seq_length):
X, y = [], []
for i in range(len(data) - seq_length):
X.append(data[i : (i + seq_length)])
y.append(data[i + seq_length])
return np.array(X), np.array(y)
# | --------------------------|
# | 2.2 Custom dataset |
# | --------------------------|
class TimeSeries(Dataset):
def __init__(self, data, seq_length):
self.X, self.y = rolling_windows(data, seq_length)
def __len__(self):
return len(self.X)
def __getitem__(self, idx):
return (torch.tensor(self.X[idx], dtype=torch.float32),
torch.tensor(self.y[idx], dtype=torch.float32))
# | --------------------------|
# | 2.3 Early stopping |
# | --------------------------|
class EarlyStopping:
def __init__(self, patience=20, min_delta=1e-4):
self.patience = patience
self.min_delta = min_delta
self.best_loss = None
self.counter = 0
def step(self, current_loss):
if self.best_loss is None or current_loss < (self.best_loss - self.min_delta):
self.best_loss = current_loss
self.counter = 0
return False # Don't stop
else:
self.counter += 1
return self.counter >= self.patience
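A minimal sanity-check sketch (not part of the pipeline) of the shapes these helpers produce, assuming a window length of 24 and the scaled training set from Section 1:
# Sketch: inspect the shapes produced by rolling_windows / TimeSeries.
ds = TimeSeries(train_set, seq_length=24)
X0, y0 = ds[0]
print(len(ds))       # len(train_set) - 24 windows
print(X0.shape)      # torch.Size([24, 442]) -> (seq_length, n_companies)
print(y0.shape)      # torch.Size([442])     -> next-day returns for all companies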
3. Model architecture¶
This is my final LSTM model. It takes the best parameters found by Optuna's hyperparameter tuning, which I describe in the next part. Basically, it is a simple LSTM with a configurable number of layers and hidden dimensions. I also add dropout for a deep LSTM (layers > 1); if the LSTM has a single layer, the dropout ratio is set to 0, so dropout can be ignored in that case. I fix some hyperparameters that I believe are the best values (as commented in the trial).
After tuning the hyperparameters, I found that the LSTM model with 2 layers performed better than other configurations. Therefore, I chose to use 2 layers in the final model setup.
def my_LSTM(best_params, input_dim, output_dim):
num_layers = 2 # best_params["num_layers"]
hidden_dim = best_params["hidden_dim"]
dropout = best_params["dropout"]
class LSTM(nn.Module):
def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
super(LSTM, self).__init__()
self.input_dim = input_dim
self.hidden_dim = hidden_dim
self.num_layers = num_layers
self.rnn = nn.LSTM(
input_size=input_dim,
hidden_size=hidden_dim,
num_layers=num_layers,
batch_first=False,
dropout = dropout if num_layers > 1 else 0.0
)
self.linear = nn.Linear(hidden_dim, output_dim)
def init_hidden(self, batch_size, device):
return (torch.zeros(self.num_layers,
batch_size,
self.hidden_dim,
device = device),
torch.zeros(self.num_layers,
batch_size,
self.hidden_dim,
device = device))
def forward(self, x):
batch_size = x.size(1)
hidden = self.init_hidden(batch_size, x.device)
y, hidden = self.rnn(x, hidden)
y = self.linear(y)
return y, hidden
model = LSTM(input_dim, hidden_dim, output_dim, num_layers, dropout)
return model
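A minimal usage sketch of my_LSTM with hypothetical hyperparameter values, showing the sequence-first tensor layout the model expects and the shapes it returns:
# Sketch: instantiate my_LSTM with hypothetical parameters and check I/O shapes.
demo_params = {"hidden_dim": 320, "dropout": 0.16}      # hypothetical values, not the tuned ones
demo_model = my_LSTM(demo_params, input_dim=442, output_dim=442)
x = torch.zeros(24, 64, 442)                            # (seq_length, batch_size, input_dim)
y, (h, c) = demo_model(x)
print(y.shape)   # torch.Size([24, 64, 442]) -> per-time-step predictions; y[-1] is the next-day forecast
print(h.shape)   # torch.Size([2, 64, 320])  -> (num_layers, batch_size, hidden_dim)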
4. Hyperparameters tuning with Optuna¶
Regarding the model structure, the following hyperparameters are tuned:
- Number of layers
- Number of features in hidden state
- Dropout layer
In terms of objective function and window size:
- Sequence length
- Weight decay (L2)
- Gradient clipping
Some fixed parameters I used:
- Batch size (no shuffling because of the time-series characteristic)
- Early stop
- Epoch
- Optimizer is Adam
I also add Gaussian noise ~ N(0, 0.0001) to the inputs at each training step, as in the sketch below.
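A minimal sketch of the noise injection, assuming the stated variance 0.0001 (i.e. a standard deviation of 0.01), matching the multiplier used inside the training loop:
# Sketch: additive Gaussian noise with variance 1e-4 (std = 0.01) on a hypothetical input batch.
X_batch = torch.zeros(24, 64, 442)                      # (seq_length, batch_size, input_dim)
X_noisy = X_batch + torch.randn_like(X_batch) * 0.01    # epsilon ~ N(0, 0.0001) element-wise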
def define_lstm_model(trial, input_dim, output_dim):
num_layers = 2 # trial.suggest_int("num_layers", 1, 3)
hidden_dim = trial.suggest_int("hidden_dim", 300, 350)
dropout = trial.suggest_float("dropout", 0.15, 0.18)
class LSTM(nn.Module):
def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
super(LSTM, self).__init__()
self.rnn = nn.LSTM(
input_size=input_dim,
hidden_size=hidden_dim,
num_layers=num_layers,
batch_first=False,
dropout = dropout if num_layers > 1 else 0.0
)
self.linear = nn.Linear(hidden_dim, output_dim)
def forward(self, x):
out, _ = self.rnn(x)
out = self.linear(out)
return out
return LSTM(input_dim, hidden_dim, output_dim, num_layers, dropout)
def objective(trial):
# | --------------------------|
# | 4.1 Hyperparameters |
# | --------------------------|
seq_length = 24 # trial.suggest_int("seq_length", 23, 31)
weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
clip_value = trial.suggest_float("clip_value", 1.5, 2.0, log=True)
batch_size = 64
early_stopper = EarlyStopping(patience=20, min_delta=1e-4)
# | --------------------------|
# | 4.2 Define the model |
# | --------------------------|
    input_dim = train_set.shape[1]  # 442 companies
output_dim = train_set.shape[1]
model = define_lstm_model(trial, input_dim, output_dim).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
criterion = nn.MSELoss()
EPOCHS = 100
# | --------------------------|
# | 4.3 Window for train |
# | --------------------------|
train_dataset = TimeSeries(train_set, seq_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
# | --------------------------|
# | 4.4 Training loops |
# | --------------------------|
for epoch in range(EPOCHS):
model.train()
epoch_loss = 0.0
for X_batch, y_batch in train_loader:
# X_batch: (batch_size, seq_length, input_dim)
# Transpose to (seq_length, batch_size, input_dim)
X_batch = X_batch.transpose(0, 1).to(DEVICE)
y_batch = y_batch.to(DEVICE)
optimizer.zero_grad()
# Add white noise epsilon ~ N(0,0.0001)
X_noisy = X_batch + torch.randn_like(X_batch) * 0.01
outputs = model(X_noisy)
# Compare final time step to y
loss = criterion(outputs[-1], y_batch)
loss.backward()
# Gradient clipping
clip(model.parameters(), clip_value)
optimizer.step()
epoch_loss += loss.item()
avg_loss = epoch_loss / len(train_loader)
trial.report(avg_loss, epoch)
if early_stopper.step(avg_loss):
print(f"Early stopping at epoch {epoch+1}")
break
if trial.should_prune():
raise optuna.exceptions.TrialPruned()
# | --------------------------|
# | 4.5 Evaluate on test set |
# | --------------------------|
test_dataset = TimeSeries(test_set, seq_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
model.eval()
total_loss = 0.0
with torch.no_grad():
for X_batch, y_batch in test_loader:
X_batch = X_batch.transpose(0, 1).to(DEVICE)
y_batch = y_batch.to(DEVICE)
outputs = model(X_batch)
loss = criterion(outputs[-1], y_batch)
total_loss += loss.item()
avg_test_loss = total_loss / len(test_loader)
return avg_test_loss
I used many trials because various hyperparameters are tuned. The early stopper saves computational time when needed.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, timeout=600)
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)
print("Best value (MSE):", study.best_trial.value)
[I 2025-04-11 14:00:30,504] A new study created in memory with name: no-name-5c359c16-4e72-4fa2-8b1c-da00e9e45394
[I 2025-04-11 14:00:50,465] Trial 0 finished with value: 0.005437457200605423 and parameters: {'weight_decay': 9.010696366541017e-05, 'lr': 3.2631921523822867e-05, 'clip_value': 1.866483325115156, 'hidden_dim': 329, 'dropout': 0.1754962395750958}. Best is trial 0 with value: 0.005437457200605423.
Early stopping at epoch 42
[I 2025-04-11 14:01:02,646] Trial 1 finished with value: 0.00541713695274666 and parameters: {'weight_decay': 1.6820759117508032e-06, 'lr': 0.00033567838742119805, 'clip_value': 1.8276086428463114, 'hidden_dim': 348, 'dropout': 0.15032483610439398}. Best is trial 1 with value: 0.00541713695274666.
Early stopping at epoch 30
[I 2025-04-11 14:01:15,396] Trial 2 finished with value: 0.005414817866403609 and parameters: {'weight_decay': 1.913899886289534e-06, 'lr': 7.877108245309796e-05, 'clip_value': 1.8749800376757997, 'hidden_dim': 311, 'dropout': 0.1556428663624871}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 33
[I 2025-04-11 14:01:28,776] Trial 3 finished with value: 0.005433881666976959 and parameters: {'weight_decay': 3.0946875752567993e-06, 'lr': 1.1342870077167936e-05, 'clip_value': 1.8135418545573978, 'hidden_dim': 303, 'dropout': 0.16711117775015494}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 35
[I 2025-04-11 14:01:40,502] Trial 4 finished with value: 0.005442619521636516 and parameters: {'weight_decay': 1.006123553176289e-05, 'lr': 0.0009483162432304677, 'clip_value': 1.6389921555239382, 'hidden_dim': 329, 'dropout': 0.16492601675200816}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 29
[I 2025-04-11 14:01:40,958] Trial 5 pruned.
[I 2025-04-11 14:01:51,021] Trial 6 finished with value: 0.005460354976821691 and parameters: {'weight_decay': 8.382687383884459e-05, 'lr': 0.0004026499915548599, 'clip_value': 1.715257939513186, 'hidden_dim': 309, 'dropout': 0.17611276292926648}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 26
[I 2025-04-11 14:01:54,606] Trial 7 pruned.
[I 2025-04-11 14:01:55,045] Trial 8 pruned.
[I 2025-04-11 14:01:55,494] Trial 9 pruned.
[I 2025-04-11 14:01:57,190] Trial 10 pruned.
[I 2025-04-11 14:02:09,374] Trial 11 finished with value: 0.0054453269694931805 and parameters: {'weight_decay': 3.0016444449495126e-05, 'lr': 0.00030132863816897135, 'clip_value': 1.9587820715261755, 'hidden_dim': 348, 'dropout': 0.1503370159585747}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 30
[I 2025-04-11 14:02:09,861] Trial 12 pruned.
[I 2025-04-11 14:02:21,940] Trial 13 finished with value: 0.005440033378545195 and parameters: {'weight_decay': 1.8626673272768084e-05, 'lr': 0.0007836118097393293, 'clip_value': 1.7940476517086086, 'hidden_dim': 315, 'dropout': 0.16100015593572392}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 31
[I 2025-04-11 14:02:22,438] Trial 14 pruned.
[I 2025-04-11 14:02:22,913] Trial 15 pruned.
[I 2025-04-11 14:02:23,411] Trial 16 pruned.
[I 2025-04-11 14:02:24,647] Trial 17 pruned.
[I 2025-04-11 14:02:25,136] Trial 18 pruned.
[I 2025-04-11 14:02:25,627] Trial 19 pruned.
[I 2025-04-11 14:02:26,082] Trial 20 pruned.
[I 2025-04-11 14:02:26,536] Trial 21 pruned.
[I 2025-04-11 14:02:27,001] Trial 22 pruned.
[I 2025-04-11 14:02:27,472] Trial 23 pruned.
[I 2025-04-11 14:02:27,956] Trial 24 pruned.
[I 2025-04-11 14:02:28,449] Trial 25 pruned.
[I 2025-04-11 14:02:28,916] Trial 26 pruned.
[I 2025-04-11 14:02:40,335] Trial 27 finished with value: 0.0054313832661136985 and parameters: {'weight_decay': 4.012516584798842e-06, 'lr': 0.0006195392971702806, 'clip_value': 1.6569226926301694, 'hidden_dim': 303, 'dropout': 0.1527784251870268}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 30
[I 2025-04-11 14:02:52,435] Trial 28 finished with value: 0.005447977152653038 and parameters: {'weight_decay': 4.424736427518117e-06, 'lr': 0.0006145175066730719, 'clip_value': 1.6563144546147344, 'hidden_dim': 313, 'dropout': 0.15372055261308065}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 31
[I 2025-04-11 14:02:53,678] Trial 29 pruned.
[I 2025-04-11 14:02:54,170] Trial 30 pruned.
[I 2025-04-11 14:03:05,644] Trial 31 finished with value: 0.005459379591047764 and parameters: {'weight_decay': 7.437705227276176e-06, 'lr': 0.0006314143097492449, 'clip_value': 1.6087132706826734, 'hidden_dim': 303, 'dropout': 0.15503312609732145}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 30
[I 2025-04-11 14:03:17,516] Trial 32 finished with value: 0.00545856540556997 and parameters: {'weight_decay': 3.65541440057519e-06, 'lr': 0.0008642340947939282, 'clip_value': 1.736418574215152, 'hidden_dim': 308, 'dropout': 0.16717370398748027}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 31
[I 2025-04-11 14:03:18,000] Trial 33 pruned.
[I 2025-04-11 14:03:18,462] Trial 34 pruned.
[I 2025-04-11 14:03:30,108] Trial 35 finished with value: 0.00546645465074107 and parameters: {'weight_decay': 1.8229481749002745e-06, 'lr': 0.000997039294911953, 'clip_value': 1.7338512746237096, 'hidden_dim': 309, 'dropout': 0.17278615138964415}. Best is trial 2 with value: 0.005414817866403609.
Early stopping at epoch 30
[I 2025-04-11 14:03:30,617] Trial 36 pruned.
[I 2025-04-11 14:03:31,083] Trial 37 pruned.
[I 2025-04-11 14:03:31,548] Trial 38 pruned.
[I 2025-04-11 14:03:32,032] Trial 39 pruned.
[I 2025-04-11 14:03:32,503] Trial 40 pruned.
[I 2025-04-11 14:03:33,026] Trial 41 pruned.
[I 2025-04-11 14:03:33,535] Trial 42 pruned.
[I 2025-04-11 14:03:34,024] Trial 43 pruned.
[I 2025-04-11 14:03:34,498] Trial 44 pruned.
[I 2025-04-11 14:03:34,962] Trial 45 pruned.
[I 2025-04-11 14:03:35,432] Trial 46 pruned.
[I 2025-04-11 14:03:35,935] Trial 47 pruned.
[I 2025-04-11 14:03:36,430] Trial 48 pruned.
[I 2025-04-11 14:03:44,356] Trial 49 pruned.
Number of finished trials: 50
Best trial: {'weight_decay': 1.913899886289534e-06, 'lr': 7.877108245309796e-05, 'clip_value': 1.8749800376757997, 'hidden_dim': 311, 'dropout': 0.1556428663624871}
Best value (MSE): 0.005414817866403609
This step reports the relative importance of each hyperparameter. The top hyperparameters in the chart motivate narrowing down their search ranges, since the chart shows which hyperparameters most affect the objective.
fig = plot_param_importances(study)
fig.show()
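If the interactive chart is not available, the same importances can be read numerically; a minimal sketch using Optuna's importance API:
# Sketch: print hyperparameter importances as numbers instead of a chart.
from optuna.importance import get_param_importances
for name, importance in get_param_importances(study).items():
    print(f"{name}: {importance:.3f}")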
5. Training phase¶
Using the best parameters found by Optuna, we train the final model on the training set.
# | --------------------------|
# | 5.1 Define hyperparameters|
# | --------------------------|
best_params = study.best_trial.params
seq_length = 24 # best_params["seq_length"]
input_dim = train_set.shape[1]
output_dim = train_set.shape[1]
# Build final model using my_LSTM
final_model = my_LSTM(best_params, input_dim, output_dim).to(DEVICE)
lr = best_params["lr"]
weight_decay = best_params["weight_decay"]
clip_value = best_params["clip_value"]
final_optimizer = optim.Adam(final_model.parameters(), lr=lr, weight_decay=weight_decay)
criterion = nn.MSELoss()
EPOCHS_FINAL = 1000
batch_size = 64
early_stopper_final = EarlyStopping(patience=20, min_delta=1e-4)
# | --------------------------|
# | 5.2 Full training data |
# | --------------------------|
train_dataset_full = TimeSeries(train_set, seq_length)
train_loader_full = DataLoader(train_dataset_full, batch_size=batch_size, shuffle=False)
print(f"Final Training Dataset: {len(train_dataset_full)} samples")
# | --------------------------|
# | 5.3 Training loops |
# | --------------------------|
for epoch in range(EPOCHS_FINAL):
final_model.train()
epoch_loss = 0.0
for X_batch, y_batch in train_loader_full:
X_batch = X_batch.transpose(0, 1).to(DEVICE)
y_batch = y_batch.to(DEVICE)
final_optimizer.zero_grad()
X_noisy = X_batch + torch.randn_like(X_batch) * 0.01
outputs, _ = final_model(X_noisy)
loss = criterion(outputs[-1], y_batch)
loss.backward()
# Gradient clipping
clip(final_model.parameters(), clip_value)
final_optimizer.step()
epoch_loss += loss.item()
avg_loss = epoch_loss / len(train_loader_full)
if (epoch + 1) % 100 == 0:
print(f"Epoch {epoch+1}/{EPOCHS_FINAL} => Loss: {avg_loss:.6f}")
if early_stopper_final.step(avg_loss):
print(f"Early stopping at epoch {epoch+1}")
print(f"Epoch {epoch+1}/{EPOCHS_FINAL} => Loss: {avg_loss:.6f}")
break
Final Training Dataset: 2392 samples
Early stopping at epoch 34
Epoch 34/1000 => Loss: 0.002531
6. Evaluate on test set¶
Using the trained model, we evaluate on the test set.
# | --------------------------|
# | 6.1 Full test data |
# | --------------------------|
test_dataset_final = TimeSeries(test_set, seq_length)
test_loader_final = DataLoader(test_dataset_final, batch_size=batch_size, shuffle=False)
# | --------------------------|
# | 6.2 Evaluate on test set |
# | --------------------------|
final_model.eval()
total_test_loss = 0.0
with torch.no_grad():
for X_batch, y_batch in test_loader_final:
X_batch = X_batch.transpose(0, 1).to(DEVICE)
y_batch = y_batch.to(DEVICE)
y_test_pred, _ = final_model(X_batch)
loss = criterion(y_test_pred[-1], y_batch)
total_test_loss += loss.item()
avg_test_loss = total_test_loss / len(test_loader_final)
print("Final Test MSE:", avg_test_loss)
Final Test MSE: 0.005414343986194581
7. Forecasting¶
# | --------------------------|
# | 7.1 Define last sequence |
# | --------------------------|
last_window = test_set[-seq_length:]
print("Last window shape:", last_window.shape)
last_window_t = torch.tensor(last_window, dtype=torch.float32).unsqueeze(1).to(DEVICE)
final_model.eval()
with torch.no_grad():
out_seq, _ = final_model(last_window_t)
next_day_pred_norm = out_seq[-1, 0].cpu().numpy()
# | --------------------------|
# | 7.2 Inverse to real values|
# | --------------------------|
next_day_pred = scaler.inverse_transform(next_day_pred_norm.reshape(1, -1)).flatten()
Last window shape: (24, 442)
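As a quick sanity check (a sketch, not part of the original pipeline), the forecast vector can be mapped back to company names via the column order of the formatted train DataFrame, which next_day_pred follows:
# Sketch: label the next-day forecasts with their company names for inspection.
pred_series = pd.Series(next_day_pred, index=train.columns, name="predicted_return")
print(pred_series.head())
print("Forecast vector length:", len(pred_series))   # expected: 442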
8. Interpretation with Captum¶
This section takes the final time step's output from the LSTM and applies Integrated Gradients to assess each feature's contribution relative to a zero baseline. It uses the last window of data to identify which lags most influenced the forecast. The accompanying chart displays the average Integrated Gradients attribution for each time step in the final window. Negative values suggest the corresponding time steps pull the predicted value down, while positive values indicate a contribution of the same sign as the prediction.
# | --------------------------|
# | 8.1 Define forward pass |
# | --------------------------|
class CaptumModel(nn.Module):
def __init__(self, lstm_model, target_index=1):
super().__init__()
self.lstm_model = lstm_model
self.target_index = target_index
def forward(self, x):
outputs, _ = self.lstm_model(x)
last_output = outputs[-1]
return last_output[:, self.target_index]
md = CaptumModel(final_model, target_index=1).to(DEVICE)
last_window_t.requires_grad_()
md.train()  # keep train mode: cuDNN LSTM backward (needed for gradients) requires training mode
# | --------------------------|
# | 8.2 Integrated Gradients |
# | --------------------------|
ig = IntegratedGradients(md)
baseline = torch.zeros_like(last_window_t).to(DEVICE)
# | --------------------------|
# | 8.3 Compute attributions |
# | --------------------------|
attributions, delta = ig.attribute(last_window_t,
baseline,
return_convergence_delta=True)
attr = attributions.squeeze(1).detach().cpu().numpy()
avg_attr_per_timestep = attr.mean(axis=1)
print("Average attributions per time step:\n", avg_attr_per_timestep)
# | --------------------------|
# | 8.4 Plot time-step |
# | --------------------------|
plt.figure(figsize=(12, 6))
plt.plot(range(len(avg_attr_per_timestep)), avg_attr_per_timestep, marker='o')
plt.xlabel("Time Step")
plt.ylabel("Average Attribution")
plt.title("Integrated Gradients Attributions over the Last Window (Time Steps)")
plt.grid(True)
plt.show()
Average attributions per time step: [8.4618076e-11 9.7116891e-11 1.3332961e-10 1.6506448e-10 2.1382948e-10 2.5399199e-10 3.3404243e-10 4.1872006e-10 5.5158889e-10 7.0463962e-10 9.3649077e-10 1.2263686e-09 1.6076981e-09 1.9979736e-09 2.6368880e-09 3.4501841e-09 4.6910014e-09 6.3431669e-09 8.2293745e-09 1.0982817e-08 1.3993827e-08 2.0980094e-08 2.9321937e-08 3.3955605e-08]
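The same attribution array can also be aggregated over time to rank companies by influence on the target series; a minimal sketch assuming the (24, 442) shape of attr above:
# Sketch: average attributions per company and list the most influential ones.
avg_attr_per_company = attr.mean(axis=0)                        # shape (442,)
top_idx = np.argsort(np.abs(avg_attr_per_company))[::-1][:5]    # 5 largest by magnitude
for i in top_idx:
    print(train.columns[i], avg_attr_per_company[i])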
9. Submission¶
temp = pd.read_csv("/kaggle/input/playing-the-stock-market-is-a-fools-game/sample_submission.csv")
temp["value"] = next_day_pred
temp.to_csv("submission.csv", index=False)
print("Done")
Done