Training loss and validation loss is nan through out using pytorch

Question

I have a cascaded neural network to perform binary classification. The first network is pertained so I simply initialise the model using its pretrained weights. The output of that model is then given to another fully connected neural network. Because I have a million samples, I use data generator to load the data on cpu and run the model on gpu. However, when I run the model my training loss and validation loss is nan through out. I have changed the learning rate and clipped gradient as well but nothing seems to work. My model is as follows:

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True


class LambdaBase(nn.Sequential):
    def __init__(self, fn, *args):
        super(LambdaBase, self).__init__(*args)
        self.lambda_func = fn

    def forward_prepare(self, input):
        output = []
        for module in self._modules.values():
            output.append(module(input))
        return output if output else input

class Lambda(LambdaBase):
    def forward(self, input):
        return self.lambda_func(self.forward_prepare(input))

class LambdaMap(LambdaBase):
    def forward(self, input):
        return list(map(self.lambda_func,self.forward_prepare(input)))

class LambdaReduce(LambdaBase):
    def forward(self, input):
        return reduce(self.lambda_func,self.forward_prepare(input))

def get_basset_model(load_weights = True):
    pretrained_model_reloaded_th = nn.Sequential( # Sequential,
        nn.Conv2d(4,300,(19, 1)),
        nn.BatchNorm2d(300),
        nn.ReLU(),
        nn.MaxPool2d((3, 1),(3, 1)),
        nn.Conv2d(300,200,(11, 1)),
        nn.BatchNorm2d(200),
        nn.ReLU(),
        nn.MaxPool2d((4, 1),(4, 1)),
        nn.Conv2d(200,200,(7, 1)),
        nn.BatchNorm2d(200),
        nn.ReLU(),
        nn.MaxPool2d((4, 1),(4, 1)),
        Lambda(lambda x: x.view(x.size(0),-1)), # Reshape,
        nn.Sequential(Lambda(lambda x: x.view(1,-1) if 1==len(x.size()) else x ),nn.Linear(2000,1000)), # Linear,
        nn.BatchNorm1d(1000,1e-05,0.1,True),#BatchNorm1d,
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Sequential(Lambda(lambda x: x.view(1,-1) if 1==len(x.size()) else x ),nn.Linear(1000,1000)), # Linear,
        nn.BatchNorm1d(1000,1e-05,0.1,True),#BatchNorm1d,
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Sequential(Lambda(lambda x: x.view(1,-1) if 1==len(x.size()) else x ),nn.Linear(1000,164)), # Linear,
        nn.Sigmoid(),
    )
    if load_weights:
        sd = torch.load('pretrained.pth')
        pretrained_model_reloaded_th.load_state_dict(sd)
    return  pretrained_model_reloaded_th

def next_model_architecture():
    next_model = nn.Sequential(
    nn.Linear(164, 64),
    nn.ReLU(),
    nn.Linear(64, 1),
    nn.Sigmoid())
    
    return next_model

def cascading_model(basset_model,next_model):
    network = nn.Sequential(basset_model, next_model)
    return network

if __name__ == "__main__":
    
    ###model_path = 'saved_model.pth'
    model_path, epochs = sys.argv[1:]
    
    # Parameters
    params = {'batch_size': 500,
              'shuffle': True,
              'num_workers': 18}
    
    max_epochs = int(epochs)
    min_valid_loss = np.inf
    
    # Datasets
    with open('X.json', 'r') as fp:
        partition = json.load(fp)


    with open('Y.json', 'r') as fp:
        labels = json.load(fp) 
    
    # Generators
    training_set = DataGenerator(partition['train'], labels)
    training_generator = torch.utils.data.DataLoader(training_set, **params)

    validation_set = DataGenerator(partition['valid'], labels)
    validation_generator = torch.utils.data.DataLoader(validation_set, **params)
    
    basset_model = get_basset_model(load_weights = True)
    if torch.cuda.is_available():
        basset_model.cuda()
    next_model = next_model_architecture()
    if torch.cuda.is_available():
        next_model.cuda()
    
    network = cascading_model(basset_model,next_model)
    if torch.cuda.is_available():
        network.cuda()
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(network.parameters(), lr = 0.0001)
    
    for e in range(3):
        train_loss = 0.0
        for data, labels in (training_generator):
            #labels = labels.
            data, labels = data.to(device,dtype=torch.float), labels.to(device)
            
            # Clear the gradients
            optimizer.zero_grad()
            # Forward Pass
            target = network(data)
            # Find the Loss
            
            labels = labels.float()
            loss = criterion(target.squeeze(1),labels)
            # Calculate gradients 
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(network.parameters(), 5)
            
            # Update Weights
            optimizer.step()
            # Calculate Loss
            train_loss += loss.item()

        valid_loss = 0.0
        for data, labels in validation_generator:
            if torch.cuda.is_available():
                data, labels = data.to(device,dtype=torch.float), labels.to(device)

            target = network(data)
            labels = labels.float()
            loss = criterion(target.squeeze(1),labels)
            valid_loss = loss.item() * data.size(0)

        print(f'Epoch {e+1} \t\t Training Loss: {train_loss / len(training_generator)} \t\t Validation Loss: {valid_loss / len(validation_generator)}')
        if min_valid_loss > valid_loss:
            print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
            min_valid_loss = valid_loss
            # Saving State Dict
            torch.save(network.state_dict(), model_path)

And my data loader looks like:

import torch
import numpy as np

class DataGenerator(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, list_IDs, labels):
        'Initialization'
        self.labels = labels
        self.list_IDs = list_IDs
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)
    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample

        ID = self.list_IDs[index]
        # Load data and get label
        X = np.load(ID,allow_pickle=True)
        X = X.reshape((4,600,1))
        y = self.labels[ID]
        y_tensor = y
        x_tensor = torch.from_numpy(X)
        return x_tensor, y_tensor

My output looks like:

Training Loss: nan               validation loss:nan
Training Loss: nan               validation loss:nan
Training Loss: nan               validation loss:nan
Training Loss: nan               validation loss:nan

Can someone explain why this is happening? And how can I resolve this issue? Am I joining the two networks correctly?

Training loss and validation loss is nan through out using pytorch

0 Answers0