
I tried to run all the computations on the GPU, but the speedup wasn't as big as I expected. Also, nvidia-smi gives this while the program is running: nvidia-smi output. So does PyTorch actually "see" my GPU or not? If not, how can I fix this? I've installed CUDA Toolkit v11.1 and cuDNN, and merged the cuDNN files into the CUDA Toolkit directory.
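A minimal sanity check (standard torch calls only, nothing specific to this notebook) that shows whether a given PyTorch build was compiled with CUDA and can actually see the device; a version string ending in +cpu would mean a CPU-only install:

import torch

print(torch.__version__)          # a '+cpu' suffix means a CPU-only build
print(torch.version.cuda)         # CUDA version PyTorch was built against (None on CPU builds)
print(torch.cuda.is_available())  # True if a usable CUDA device is visible
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # GPU model as reported by the driver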

# In[1]:


import torch
import torch.nn.functional as F
import torchvision
import numpy as np
import matplotlib.pyplot as plt
from timeit import default_timer as timer 
from typing import Tuple, List, Type, Dict, Any


# In[2]:


torch.manual_seed(0)
np.random.seed(0)


# In[3]:


class Perceptron(torch.nn.Module):
    
    def __init__(self, 
                 input_resolution: Tuple[int, int] = (28, 28),
                 input_channels: int = 1, 
                 hidden_layer_features: List[int] = [256, 256, 256],
                 activation: Type[torch.nn.Module] = torch.nn.Tanh,
                 num_classes: int = 10):
        
        
        super().__init__()
        
        self.input_resolution = input_resolution
        self.input_channels = input_channels
        self.hidden_layer_features = hidden_layer_features
        self.activation = activation()
        self.num_classes = num_classes
        
        # Layers (note: hidden_layer_features is stored but currently unused;
        # the layer sizes below are hardcoded):
        self.fc1 = torch.nn.Linear(self.input_resolution[0] * self.input_resolution[1], 128)
        self.fc2 = torch.nn.Linear(128, self.num_classes)
        
    def forward(self, X: torch.Tensor) -> torch.Tensor:
        
        X = self.fc1(X)
        X = self.activation(X)
        
        X = self.fc2(X)
        # Return raw logits: torch.nn.CrossEntropyLoss applies log_softmax
        # internally, so an extra F.softmax here double-normalizes the
        # outputs and slows learning.
        return X


# In[4]:


if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')


# In[5]:


model = Perceptron().to(device)
print(model)
print('Total number of trainable parameters', 
      sum(p.numel() for p in model.parameters() if p.requires_grad))


# In[6]:


train_transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
valid_transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])


# In[7]:


train_dataset = torchvision.datasets.MNIST(root='./mnist', 
                                           train=True, 
                                           download=True,
                                           transform=train_transforms)

valid_dataset = torchvision.datasets.MNIST(root='./mnist', 
                                         train=False, 
                                         download=True, 
                                         transform=valid_transforms)


# In[8]:


indices = np.random.randint(0, len(train_dataset), size=64)

fig, axes = plt.subplots(nrows=8, ncols=8, figsize=(10, 10))
for i, row in enumerate(axes):
    for j, ax in enumerate(row):
        sample_index = indices[i * 8 + j]  # 8 samples per row
        sample, label = train_dataset[sample_index]
        ax.imshow(sample.numpy().squeeze(), cmap='gray')  # drop the channel dim for imshow
        ax.set_title(label)


# In[9]:


def train_single_epoch(model: torch.nn.Module,
                       optimizer: torch.optim.Optimizer, 
                       loss_function: torch.nn.Module, 
                       data_loader: torch.utils.data.DataLoader):
    
    
    model.train()
    
    for data in data_loader:
        
        X, y = data
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        output = model(X.view(-1, 784))  # flatten 28x28 images into 784-vectors
        loss = loss_function(output, y)
        loss.backward()
        optimizer.step()


# In[10]:


def validate_single_epoch(model: torch.nn.Module,
                          loss_function: torch.nn.Module, 
                          data_loader: torch.utils.data.DataLoader):
    
    model.eval()
    loss_total = 0.0
    accuracy_total = 0
    
    with torch.no_grad():  # no gradients needed for validation
        for data in data_loader:

            X, y = data
            X, y = X.to(device), y.to(device)
            output = model(X.view(-1, 784))
            loss = loss_function(output, y)
            loss_total += loss.item()  # .item() avoids keeping the graph / GPU tensor alive
            
            y_pred = output.argmax(dim=1, keepdim=True)
            accuracy_total += y_pred.eq(y.view_as(y_pred)).sum().item()
        
    loss_avg = loss_total / len(data_loader)  # mean of per-batch mean losses
    accuracy_avg = 100.0 * accuracy_total / len(data_loader.dataset)
    
    return {'loss': loss_avg, 'accuracy': accuracy_avg}


# In[11]:


def train_model(model: torch.nn.Module, 
                train_dataset: torch.utils.data.Dataset,
                valid_dataset: torch.utils.data.Dataset,
                loss_function: torch.nn.Module = torch.nn.CrossEntropyLoss(),
                optimizer_class: Type[torch.optim.Optimizer] = torch.optim.Adam,
                optimizer_params: Dict = {},
                initial_lr = 0.01,
                lr_scheduler_class: Any = torch.optim.lr_scheduler.ReduceLROnPlateau,
                lr_scheduler_params: Dict = {},
                batch_size = 64,
                max_epochs = 1000,
                early_stopping_patience = 20):
    
    
    optimizer = optimizer_class(model.parameters(), lr=initial_lr, **optimizer_params)
    lr_scheduler = lr_scheduler_class(optimizer, **lr_scheduler_params)
    
    train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)

    best_valid_loss = None
    best_epoch = None
    
    for epoch in range(max_epochs):
        
        print(f'Epoch {epoch}')
        start = timer()
        train_single_epoch(model, optimizer, loss_function, train_loader)
        valid_metrics = validate_single_epoch(model, loss_function, valid_loader)
        print('time:', timer() - start)
        print(f'Validation metrics: \n{valid_metrics}')

        lr_scheduler.step(valid_metrics['loss'])
        
        if best_valid_loss is None or best_valid_loss > valid_metrics['loss']:
            print('Best model yet, saving')
            best_valid_loss = valid_metrics['loss']
            best_epoch = epoch
            torch.save(model.state_dict(), './best_model.pth')  # save weights only
            
        if epoch - best_epoch > early_stopping_patience:
            print('Early stopping triggered')
            return


# In[ ]:


train_model(model, 
            train_dataset=train_dataset, 
            valid_dataset=valid_dataset, 
            loss_function=torch.nn.CrossEntropyLoss(), 
            initial_lr=0.01, max_epochs=500)
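
On the speedup side: a two-layer MLP on MNIST does so little work per batch that the run is often bottlenecked by data loading, which leaves the GPU mostly idle in nvidia-smi. Below is a sketch of the DataLoader settings usually tried first; num_workers=4 is an assumed value to tune per machine, and non_blocking=True only helps together with pin_memory=True:

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=64,
    num_workers=4,    # assumed value; worker processes prepare batches in parallel
    pin_memory=True,  # page-locked host memory makes host-to-GPU copies faster
)

for X, y in train_loader:
    # non_blocking=True lets the copy overlap with GPU computation
    X = X.to(device, non_blocking=True)
    y = y.to(device, non_blocking=True)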