I am running inference on a single 224x224 image with a vision transformer model (DeiT). To inspect the model's behaviour, I divided the image into 196 patches of size 16x16 each and manipulated the pixels of one patch.
When I feed these patches to the model, I get the error: Input image size (16*16) doesn't match (224*224). Of course, the model was trained on 224x224 images and expects the same size. My idea is to combine the patches back into one complete image, but I am having trouble getting that to work.
The single image shape: [1, 3, 224, 224]. The divided-into-patches shape: [196, 16, 16, 3].
import torch
from models.deit import deit_small_patch16_224
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import os
from torchvision.transforms import transforms as transforms
from torchvision.utils import make_grid
def into_patches(im, xPieces, yPieces):
    imgwidth, imgheight = im.size
    height = imgheight // yPieces
    width = imgwidth // xPieces
    # fig, axs = plt.subplots(yPieces, xPieces)
    img_list = []
    for i in range(0, yPieces):
        for j in range(0, xPieces):
            box = (j * width, i * height, (j + 1) * width, (i + 1) * height)
            a = im.crop(box)
            np_img = np.asarray(a)
            if i == 6 and j == 5:
                # zero out the pixels of this one patch (grid row 6, column 5)
                np_img.setflags(write=1)
                np_img[:] = 0
            img_list.append(np_img)
    return img_list
class_names = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
model = deit_small_patch16_224(pretrained=True, use_top_n_heads=8, use_patch_outputs=False)
checkpoint = torch.load("./checkpoint/deit224.t7")
state_dict = checkpoint["model"]
new_state_dict = {}
for key in state_dict:
    # drop the first dot-separated component of each checkpoint key (e.g. a "module." prefix)
    new_key = '.'.join(key.split('.')[1:])
    new_state_dict[new_key] = state_dict[key]
model.head = torch.nn.Linear(in_features=model.head.in_features, out_features=10)
model.load_state_dict(new_state_dict)
model.eval()
img = Image.open("bird.jpeg")
img = img.resize((224, 224), resample=0)
a = np.array(into_patches(img, 14, 14))
img_tensor = torch.tensor(a)
# print(img_tensor.shape)
with torch.no_grad():
    output = model(img_tensor)
predicted_class = np.argmax(output)
print(predicted_class.item())
I get the following error:
AssertionError: Input image size (16*3) doesn't match model (224*224).
Is there any way to combine these 196 patches back into a 224x224 image?
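For reference, the kind of reassembly I have been attempting looks roughly like this (only a sketch; I am not sure the reshape/transpose axis order is correct, and full is just a throwaway name):

import numpy as np
import torch

# a has shape [196, 16, 16, 3]: 196 patches in row-major order of a 14x14 grid
a = np.array(into_patches(img, 14, 14))

# [196, 16, 16, 3] -> [14, 14, 16, 16, 3]: recover the grid row/column of each patch,
# transpose to [14, 16, 14, 16, 3] so grid rows sit next to patch rows,
# then collapse back to a single [224, 224, 3] image
full = a.reshape(14, 14, 16, 16, 3).transpose(0, 2, 1, 3, 4).reshape(224, 224, 3)

# rearrange to the [1, 3, 224, 224] layout the model expects
img_tensor = torch.from_numpy(full).permute(2, 0, 1).unsqueeze(0).float()
print(img_tensor.shape)  # torch.Size([1, 3, 224, 224])

Does this axis ordering actually put the patches back in their original positions, or is there a cleaner way to do it?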