import torch
import torch.nn as nn
import torch.optim as optim

class googleNet(nn.Module):
    def __init__(self, latent_dim=512):
        super(googleNet, self).__init__()
        self.model = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', pretrained=True)
        # freeze parameters (trains faster and keeps the ImageNet weight values)
        for params in self.model.parameters():
            params.requires_grad = False
        # replace the last fully connected layer (created after freezing, so it stays trainable)
        self.model.fc = nn.Linear(self.model.fc.in_features, latent_dim)

    def forward(self, x):
        output = self.model(x)
        return output
class Lstm(nn.Module):
    def __init__(self, latent_dim=512, hidden_size=256, lstm_layers=2, bidirectional=True):
        super(Lstm, self).__init__()
        self.latent_dim = latent_dim
        self.hidden_size = hidden_size
        self.lstm_layers = lstm_layers
        self.bidirectional = bidirectional
        self.Lstm = nn.LSTM(self.latent_dim, hidden_size=self.hidden_size, num_layers=self.lstm_layers, batch_first=True, bidirectional=self.bidirectional)
        # hidden state is carried across forward() calls until explicitly reset
        self.hidden_state = None

    def reset_hidden_state(self):
        self.hidden_state = None

    def forward(self, x):
        output, self.hidden_state = self.Lstm(x, self.hidden_state)
        return output
class ConvLstm(nn.Module):
    def __init__(self, google, lstm, n_class=10):
        super(ConvLstm, self).__init__()
        self.modela = google
        self.modelb = lstm
        self.output_layer = nn.Sequential(
            nn.Linear(2 * self.modelb.hidden_size if self.modelb.bidirectional else self.modelb.hidden_size, n_class),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        batch_size, timesteps, channel_x, h_x, w_x = x.shape
        # fold the time dimension into the batch so the CNN sees individual frames
        conv_input = x.view(batch_size * timesteps, channel_x, h_x, w_x)
        conv_output = self.modela(conv_input)
        # unfold back to (batch, time, features) for the LSTM
        lstm_input = conv_output.view(batch_size, timesteps, -1)
        lstm_output = self.modelb(lstm_input)
        # keep only the output at the last timestep
        lstm_output = lstm_output[:, -1, :]
        output = self.output_layer(lstm_output)
        return output
Above is the NN that I use, and the following code is used to train it.
modela = googleNet()
modelb = Lstm()
modelc = ConvLstm(modela, modelb).to(device)

## Loss and optimizer
learning_rate = 5e-4  # I picked this because it seems to be the most used by experts
load_model = True
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(modelc.parameters(), lr=learning_rate)  # Adam seems to be the most popular for deep learning
modelc.train()
for epoch in range(100):  # I decided to train the model for 100 epochs
    loss_ep = 0
    for batch_idx, (data, targets) in enumerate(zip(features_train, labels_train)):
        data = data.to(device)
        targets = targets.to(device)
        ## Forward pass
        optimizer.zero_grad()
        modelc.modelb.reset_hidden_state()
        scores = modelc(data)
        loss = criterion(scores, targets)
        loss.backward()
        optimizer.step()
        loss_ep += loss.item()
    print(f"Loss in epoch {epoch} :::: {loss_ep/len(features_train)}")
with torch.no_grad():
    num_correct = 0
    num_samples = 0
The cross-entropy loss stays at 2.301 through all 100 epochs. What is going wrong? I have read that CrossEntropyLoss already includes a softmax, and I removed it from the output layer, but the loss still stays at the same value.
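For reference, this is what I mean by removing it: the classification head reduces to a plain linear layer that emits raw logits, with no nn.Softmax afterwards.

    self.output_layer = nn.Linear(
        2 * self.modelb.hidden_size if self.modelb.bidirectional else self.modelb.hidden_size,
        n_class
    )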
In the documentation for torch.nn.CrossEntropyLoss, we find "The input is expected to contain raw, unnormalized scores for each class," but you apply softmax as the final step in the output of modelc. The duplicate thread specifically names this mistake as a common programming bug. – Sycorax Jun 26 '22 at 17:59
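A minimal sketch of the point the comment makes, using standalone tensors (10 classes, as in the model above): nn.CrossEntropyLoss is log-softmax followed by NLLLoss, so it expects raw logits. Feeding it softmax outputs confines its inputs to [0, 1], so the internal softmax sees a nearly flat distribution and the loss cannot move far from ln(10) ≈ 2.303, which matches the 2.301 plateau reported above.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    torch.manual_seed(0)
    logits = torch.randn(4, 10)           # raw scores: batch of 4, 10 classes
    targets = torch.randint(0, 10, (4,))

    criterion = nn.CrossEntropyLoss()
    # CrossEntropyLoss = log_softmax + NLLLoss, so these two values are identical:
    print(criterion(logits, targets))
    print(F.nll_loss(F.log_softmax(logits, dim=-1), targets))
    # Passing softmax output instead squashes the "logits" into [0, 1];
    # the loss then sits near ln(10) ≈ 2.303 regardless of what the network learned:
    print(criterion(F.softmax(logits, dim=-1), targets))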