import torch
import torch.nn as nn
import torch.optim as optim

class googleNet(nn.Module):
    def __init__(self, latent_dim=512):
        super(googleNet, self).__init__()
        self.model = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', pretrained=True)
        # freeze parameters (trains faster and keeps the ImageNet weight values)
        for params in self.model.parameters():
            params.requires_grad = False
        # replace the last fully connected layer (created after freezing, so it stays trainable)
        self.model.fc = nn.Linear(self.model.fc.in_features, latent_dim)

    def forward(self, x):
        output = self.model(x)
        return output
class Lstm(nn.Module):
    def __init__(self, latent_dim=512, hidden_size=256, lstm_layers=2, bidirectional=True):
        super(Lstm, self).__init__()
        self.latent_dim = latent_dim
        self.hidden_size = hidden_size
        self.lstm_layers = lstm_layers
        self.bidirectional = bidirectional
        self.Lstm = nn.LSTM(self.latent_dim, hidden_size=self.hidden_size, num_layers=self.lstm_layers, batch_first=True, bidirectional=self.bidirectional)
        # hidden state is carried across forward() calls until explicitly reset
        self.hidden_state = None

    def reset_hidden_state(self):
        self.hidden_state = None

    def forward(self, x):
        output, self.hidden_state = self.Lstm(x, self.hidden_state)
        return output
class ConvLstm(nn.Module):
    def __init__(self, google, lstm, n_class=10):
        super(ConvLstm, self).__init__()
        self.modela = google
        self.modelb = lstm
        self.output_layer = nn.Sequential(
            nn.Linear(2 * self.modelb.hidden_size if self.modelb.bidirectional else self.modelb.hidden_size, n_class),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        batch_size, timesteps, channel_x, h_x, w_x = x.shape
        # fold the time dimension into the batch so the CNN sees individual frames
        conv_input = x.view(batch_size * timesteps, channel_x, h_x, w_x)
        conv_output = self.modela(conv_input)
        # unfold back to (batch, time, features) for the LSTM
        lstm_input = conv_output.view(batch_size, timesteps, -1)
        lstm_output = self.modelb(lstm_input)
        # keep only the output at the last timestep
        lstm_output = lstm_output[:, -1, :]
        output = self.output_layer(lstm_output)
        return output
Above is the NN that I use, and the following code is used to train it.
modela = googleNet()
modelb = Lstm()
modelc = ConvLstm(modela, modelb).to(device)

## Loss and optimizer
learning_rate = 5e-4  # I picked this because it seems to be the most used by experts
load_model = True
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(modelc.parameters(), lr=learning_rate)  # Adam seems to be the most popular for deep learning
modelc.train()
for epoch in range(100):  # I decided to train the model for 100 epochs
    loss_ep = 0
    for batch_idx, (data, targets) in enumerate(zip(features_train, labels_train)):
        data = data.to(device)
        targets = targets.to(device)
        ## Forward pass
        optimizer.zero_grad()
        modelc.modelb.reset_hidden_state()
        scores = modelc(data)
        loss = criterion(scores, targets)
        loss.backward()
        optimizer.step()
        loss_ep += loss.item()
    print(f"Loss in epoch {epoch} :::: {loss_ep/len(features_train)}")
with torch.no_grad():
    num_correct = 0
    num_samples = 0
The cross-entropy loss stays at 2.301 through all 100 epochs. What is going wrong? I have read that CrossEntropyLoss already includes a softmax, and I removed it from the output layer, but the loss still stays at the same value.
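For reference, this is what I mean by removing it: the classification head reduces to a plain linear layer that emits raw logits, with no nn.Softmax afterwards.

    self.output_layer = nn.Linear(
        2 * self.modelb.hidden_size if self.modelb.bidirectional else self.modelb.hidden_size,
        n_class
    )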
In the documentation for torch.nn.CrossEntropyLoss, we find "The input is expected to contain raw, unnormalized scores for each class," but you apply softmax as the final step in the output of modelc. The duplicate thread specifically names this mistake as a common programming bug. – Sycorax Jun 26 '22 at 17:59
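A minimal sketch of the point the comment makes, using standalone tensors (10 classes, as in the model above): nn.CrossEntropyLoss is log-softmax followed by NLLLoss, so it expects raw logits. Feeding it softmax outputs confines its inputs to [0, 1], so the internal softmax sees a nearly flat distribution and the loss cannot move far from ln(10) ≈ 2.303, which matches the 2.301 plateau reported above.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    torch.manual_seed(0)
    logits = torch.randn(4, 10)           # raw scores: batch of 4, 10 classes
    targets = torch.randint(0, 10, (4,))

    criterion = nn.CrossEntropyLoss()
    # CrossEntropyLoss = log_softmax + NLLLoss, so these two values are identical:
    print(criterion(logits, targets))
    print(F.nll_loss(F.log_softmax(logits, dim=-1), targets))
    # Passing softmax output instead squashes the "logits" into [0, 1];
    # the loss then sits near ln(10) ≈ 2.303 regardless of what the network learned:
    print(criterion(F.softmax(logits, dim=-1), targets))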