FastAI Course Lecture 6 Part 3 Notes

Computer Vision
FastAI
Author

Kanav Sharma

Published

May 7, 2024

Call the libraries, download the data, create the folders... the usual setup.

#hide
!pip install -Uqq fastbook
!pip install timm

import fastbook
fastbook.setup_book()
import timm

#hide
from fastbook import *
from fastai.vision.widgets import *
from fastai.vision.all import *

path = Path('/content')
untar_data(URLs.FOOD, data=path)

# actual path to train image folder
train_path = Path('/content/food-101/images')
test_path = Path('/content/food-101/test')

# Create Test folder

import os
import random
import shutil

def move_images_to_test(source_folder, test_folder, percentage=0.1):
    # Create the test folder if it doesn't exist
    os.makedirs(test_folder, exist_ok=True)

    # Iterate through each subfolder in the source folder
    for subfolder in os.listdir(source_folder):
        subfolder_path = os.path.join(source_folder, subfolder)

        # Check if it's a directory
        if os.path.isdir(subfolder_path):
            # Get a list of all image files in the subfolder
            image_files = [f for f in os.listdir(subfolder_path) if f.endswith('.jpg')]

            # Calculate the number of images to move
            num_images_to_move = int(len(image_files) * percentage)

            # Randomly select images to move
            images_to_move = random.sample(image_files, num_images_to_move)

            # Move selected images to the test folder
            for image in images_to_move:
                source_path = os.path.join(subfolder_path, image)
                dest_path = os.path.join(test_folder, image)
                shutil.move(source_path, dest_path)

if __name__ == "__main__":
    move_images_to_test(train_path, test_path, percentage=0.15)


dls = ImageDataLoaders.from_folder(path, valid_pct=0.2, seed=42,
    item_tfms=Resize(256, method='squish'),
    batch_tfms=aug_transforms(size=128, min_scale=0.75))

dls.show_batch(max_n=4)

Under-sampling

Previously, we had a very large dataset to train on, which is why training took so long. Learning from that, we can under-sample our training dataset to get a quick picture of which model works best. Generally, if a model performs well on a proportion of the dataset, it will likely perform well on the whole dataset.

There are two ways of doing this:

  • We can take 5-10% of every type of food and then train our model.

  • We can target one food type and train our model on it.

The problem with case 1 is that we would have so little data from each type of food that our model might not be able to learn it well. With case 2, we can significantly reduce the training time and computational resources required.
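For reference, case 1 could look something like the following hypothetical sketch (not used in this notebook), which samples a fraction of images from every class folder:

# Hypothetical sketch of case 1: sample a fraction of images from each class.
import random
from pathlib import Path

def sample_per_class(src, pct=0.1):
    # Map each class name to a random pct-sized sample of its image files.
    sampled = {}
    for cls_dir in Path(src).iterdir():
        if cls_dir.is_dir():
            files = list(cls_dir.glob('*.jpg'))
            sampled[cls_dir.name] = random.sample(files, int(len(files) * pct))
    return sampled

# e.g. subset = sample_per_class(train_path, pct=0.05)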

subfolders = [f.name for f in os.scandir(train_path) if f.is_dir()]
subfolder_count = len(subfolders)
print(subfolders)
['fish_and_chips', 'caprese_salad', 'strawberry_shortcake', 'pork_chop', 'edamame', 'macaroni_and_cheese', 'gnocchi', 'lobster_roll_sandwich', 'takoyaki', 'baklava', 'sushi', 'beef_tartare', 'miso_soup', 'steak', 'hot_dog', 'grilled_cheese_sandwich', 'greek_salad', 'crab_cakes', 'falafel', 'beet_salad', 'apple_pie', 'onion_rings', 'chocolate_mousse', 'risotto', 'chicken_wings', 'french_fries', 'pancakes', 'paella', 'chicken_quesadilla', 'gyoza', 'bread_pudding', 'beignets', 'carrot_cake', 'waffles', 'ceviche', 'huevos_rancheros', 'ravioli', 'sashimi', 'bibimbap', 'creme_brulee', 'spaghetti_bolognese', 'cheese_plate', 'oysters', 'filet_mignon', 'baby_back_ribs', 'fried_rice', 'ice_cream', 'tacos', 'cheesecake', 'foie_gras', 'shrimp_and_grits', 'macarons', 'poutine', 'french_onion_soup', 'deviled_eggs', 'grilled_salmon', 'eggs_benedict', 'croque_madame', 'seaweed_salad', 'churros', 'hummus', 'bruschetta', 'club_sandwich', 'ramen', 'clam_chowder', 'cup_cakes', 'hot_and_sour_soup', 'garlic_bread', 'breakfast_burrito', 'guacamole', 'lobster_bisque', 'spring_rolls', 'samosa', 'red_velvet_cake', 'pulled_pork_sandwich', 'escargots', 'chocolate_cake', 'spaghetti_carbonara', 'caesar_salad', 'hamburger', 'tuna_tartare', 'donuts', 'fried_calamari', 'mussels', 'omelette', 'panna_cotta', 'pad_thai', 'beef_carpaccio', 'pizza', 'nachos', 'chicken_curry', 'pho', 'tiramisu', 'frozen_yogurt', 'peking_duck', 'prime_rib', 'cannoli', 'dumplings', 'french_toast', 'lasagna', 'scallops']
# let's randomly take baby_back_ribs folder for training
trn_path = train_path/'baby_back_ribs'
tst_files = get_image_files(test_path).sorted()

GPU Problem

In the previous notebook, we encountered a problem with the GPU: we ran out of memory and had to wait until Kaggle cleared our quota (on Saturday). We can use gradient accumulation or half-precision floating point to avoid such GPU constraints in the future. As for half-precision floating point, we tested it in the first notebook of this series and observed minimal change in performance.
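As a refresher, fastai turns on mixed precision with a single call; a minimal sketch, assuming `dls` already exists:

# Mixed precision: compute in float16 where safe, keep master weights in float32.
learn = vision_learner(dls, 'convnext_small_in22k', metrics=error_rate).to_fp16()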

Gradient Accumulation

How gradient accumulation works:

  • Forward pass: input data is fed through the model to compute predictions.

  • Backward pass: gradients are computed by back-propagating the error through the network.

  • Gradients are accumulated over multiple mini-batches.

  • Model parameters are updated only after a certain number of mini-batches.

By accumulating gradients over multiple batches, we can simulate the effect of a larger batch size without exceeding the available memory.

However, there is a catch: it can impact training time. While increasing accumulation saves GPU memory, it may also slow down training, because the model parameters are updated less frequently, potentially prolonging convergence.
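fastai packages this as the GradientAccumulation callback used later in this notebook; here is a minimal sketch of the underlying idea in plain PyTorch (toy model and random data, purely illustrative):

import torch
from torch import nn

# Hypothetical toy setup so the loop below is runnable end to end.
model = nn.Linear(10, 2)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
data = [(torch.randn(16, 10), torch.randint(0, 2, (16,))) for _ in range(8)]

accum_steps = 4  # simulate an effective batch size of 16 * 4 = 64

optimizer.zero_grad()
for i, (xb, yb) in enumerate(data):
    loss = loss_fn(model(xb), yb) / accum_steps  # scale so the summed gradient matches one big batch
    loss.backward()                              # gradients accumulate into param.grad
    if (i + 1) % accum_steps == 0:
        optimizer.step()        # update only every accum_steps mini-batches
        optimizer.zero_grad()   # reset for the next accumulation window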

fine_tune() vs fit_one_cycle()

fine_tune(): it uses transfer learning, taking a model pre-trained (on ImageNet) and fine-tuning it on a specific dataset. The idea is to leverage the features learned by the pre-trained model and adapt them to the new dataset.

It uses ‘discriminative learning rates’, where earlier layers are trained with lower learning rates (so they change slowly and keep the general features they have already learned), while later layers are trained with higher learning rates to adapt more quickly to the new task.
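fastai exposes this through a slice of learning rates; a minimal sketch, assuming a `learn` object like the one created later in this notebook:

# Earlier layer groups get ~1e-5, the last group ~1e-3; fastai spreads the
# groups in between across that range.
learn.fit_one_cycle(3, lr_max=slice(1e-5, 1e-3))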

fit_one_cycle(): it is used for training a model from scratch, or for further training a model that has already been fine-tuned.

  • It starts with a low learning rate and gradually increases it over the first half of training.

  • In the second half, it decreases the learning rate again.

  • This one-cycle schedule is stretched over the number of epochs specified (see the sketch below).
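To make the shape concrete, here is a simplified stand-in for the schedule (a cosine ramp up then down, not fastai's exact internals):

import math

def one_cycle_lr(step, total_steps, lr_start=1e-3, lr_max=1e-2, pct_warmup=0.5):
    # Simplified one-cycle: cosine ramp lr_start -> lr_max, then lr_max -> ~0.
    warmup = int(total_steps * pct_warmup)
    if step < warmup:
        t = step / warmup
        return lr_start + (lr_max - lr_start) * (1 - math.cos(math.pi * t)) / 2
    t = (step - warmup) / (total_steps - warmup)
    return lr_max * (1 + math.cos(math.pi * t)) / 2

print([round(one_cycle_lr(s, 100), 4) for s in (0, 25, 50, 75, 99)])
# lr rises to lr_max mid-training, then decays towards zero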

fine_tune() is primarily used for transfer learning, leveraging pre-trained models and adapting them to new data, while fit_one_cycle() is used for training models from scratch or fine-tuning them further using the one-cycle learning rate schedule.

fit_one_cycle() is faster since it doesn’t do an initial fine-tuning of the head.

def train(arch, size, item=Resize(480, method='squish'), accum=1, finetune=True, epochs=5):
    # Shrink the per-step batch size as accumulation grows, so the
    # effective batch size (bs * accum) stays at 64.
    dls = ImageDataLoaders.from_folder(trn_path, valid_pct=0.2, item_tfms=item,
        batch_tfms=aug_transforms(size=size, min_scale=0.75), bs=64//accum)
    # Step the optimizer only once 64 samples' worth of gradients have accumulated.
    cbs = GradientAccumulation(64) if accum else []
    learn = vision_learner(dls, arch, metrics=error_rate, cbs=cbs).to_fp16()
    if finetune:
        learn.fine_tune(epochs, 0.01)
        return learn.tta(dl=dls.test_dl(tst_files))
    else:
        learn.unfreeze()
        learn.fit_one_cycle(epochs, 0.01)

Check the available GPU memory on Kaggle.

import torch

def check_gpu_memory():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        total_memory = torch.cuda.get_device_properties(device).total_memory
        reserved_memory = torch.cuda.memory_reserved(device)
        allocated_memory = torch.cuda.memory_allocated(device)
        free_memory = total_memory - reserved_memory - allocated_memory

        print(f"Total GPU memory: {total_memory / (1024 ** 3):.2f} GB")
        print(f"Reserved GPU memory: {reserved_memory / (1024 ** 3):.2f} GB")
        print(f"Allocated GPU memory: {allocated_memory / (1024 ** 3):.2f} GB")
        print(f"Free GPU memory: {free_memory / (1024 ** 3):.2f} GB")
    else:
        print("GPU not available.")

# Call the function to check GPU memory
check_gpu_memory()
Total GPU memory: 14.75 GB
Reserved GPU memory: 0.00 GB
Allocated GPU memory: 0.00 GB
Free GPU memory: 14.75 GB

Impact of Gradient Accumulation

train('convnext_small_in22k', 128, epochs=1, accum=1, finetune=False)
/opt/conda/lib/python3.10/site-packages/timm/models/_factory.py:117: UserWarning: Mapping deprecated model name convnext_small_in22k to current convnext_small.fb_in22k.
  model = create_fn(
epoch train_loss valid_loss error_rate time
0 0.000000 0.000000 0.000000 00:08

It took 0:08 on GPU T4 x2 and 0:22 on GPU P100.

Memory Consumption and Clearing It After Usage

import gc
def report_gpu():
    print(torch.cuda.list_gpu_processes())  # per-process GPU memory usage
    gc.collect()                  # drop unreferenced Python objects (and their tensors)
    torch.cuda.empty_cache()      # release cached GPU memory back to the driver
report_gpu()
GPU:0
process       2200 uses     3250.000 MB GPU memory

So with accum=1 the GPU used around 3 GB of memory. Let’s try accum=2:

train('convnext_small_in22k', 128, epochs=1, accum=2, finetune=False)
print("Report GPU:")
report_gpu()  # prints its report and returns None, so no print() wrapper

print("\nGPU_Memory:")
check_gpu_memory()
epoch train_loss valid_loss error_rate time
0 0.000000 0.000000 0.000000 00:06
Report GPU:
GPU:0
process       2200 uses     2200.000 MB GPU memory

GPU_Memory:
Total GPU memory: 14.75 GB
Reserved GPU memory: 0.14 GB
Allocated GPU memory: 0.02 GB
Free GPU memory: 14.59 GB

As we can see, memory usage has now gone down to about 2 GB. It’s not halved, since there’s other overhead involved (for larger models this overhead is likely to be relatively smaller).

Let’s try 4:

train('convnext_small_in22k', 128, epochs=1, accum=4, finetune=False)
print("Report GPU:")
report_gpu()

print("\nGPU_Memory:")
check_gpu_memory()
epoch train_loss valid_loss error_rate time
0 0.000000 0.000000 0.000000 00:08
Report GPU:
GPU:0
process       2200 uses     1664.000 MB GPU memory

GPU_Memory:
Total GPU memory: 14.75 GB
Reserved GPU memory: 0.14 GB
Allocated GPU memory: 0.02 GB
Free GPU memory: 14.59 GB

We are down to half of the original usage.

Memory Usage of Every Model

Let’s test this approach on all models that we want to evaluate and determine the optimal value for gradient accumulation. Kaggle provides a 16 GB GPU, and our goal is to fit all of our architectures within this constraint.
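Rather than guessing, we could probe increasing accum values until a run fits; a rough sketch (not run here), assuming the train and report_gpu helpers defined above:

# Hypothetical probe: larger accum -> smaller per-step batch -> less memory.
for accum in (1, 2, 4, 8):
    try:
        train('convnext_base_in22k', 256, epochs=1, accum=accum, finetune=False)
        print(f"accum={accum} fits in memory")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"accum={accum} ran out of memory")
        gc.collect(); torch.cuda.empty_cache()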

convnext_large_in22k always makes the GPU crash, so let’s tone it down to convnext_base_in22k.

train('convnext_base_in22k', 256, epochs=2, accum=1, finetune=False)
print("Report GPU:")
report_gpu()

print("\nGPU_Memory:")
check_gpu_memory()
/opt/conda/lib/python3.10/site-packages/timm/models/_factory.py:117: UserWarning: Mapping deprecated model name convnext_base_in22k to current convnext_base.fb_in22k.
  model = create_fn(
epoch train_loss valid_loss error_rate time
0 0.000000 0.000000 0.000000 00:12
1 0.000000 0.000000 0.000000 00:12
Report GPU:
GPU:0
process       2200 uses    12246.000 MB GPU memory

GPU_Memory:
Total GPU memory: 14.75 GB
Reserved GPU memory: 0.14 GB
Allocated GPU memory: 0.02 GB
Free GPU memory: 14.59 GB
train('convnext_base_in22k', 256, epochs=2, accum=2, finetune=False)
print("Report GPU:")
report_gpu()

print("\nGPU_Memory:")
check_gpu_memory()
epoch train_loss valid_loss error_rate time
0 0.000000 0.000000 0.000000 00:12
1 0.000000 0.000000 0.000000 00:12
Report GPU:
GPU:0
process       2200 uses     6988.000 MB GPU memory

GPU_Memory:
Total GPU memory: 14.75 GB
Reserved GPU memory: 0.14 GB
Allocated GPU memory: 0.02 GB
Free GPU memory: 14.59 GB
train('convnext_base_in22k', 256, epochs=2, accum=4, finetune=False)
print("Report GPU:")
report_gpu()

print("\nGPU_Memory:")
check_gpu_memory()
epoch train_loss valid_loss error_rate time
0 0.000000 0.000000 0.000000 00:13
1 0.000000 0.000000 0.000000 00:12
Report GPU:
GPU:0
process       2200 uses     4360.000 MB GPU memory

GPU_Memory:
Total GPU memory: 14.75 GB
Reserved GPU memory: 0.14 GB
Allocated GPU memory: 0.02 GB
Free GPU memory: 14.59 GB

With accum=4, memory consumption is about a third of the original, and there is not much change in the time taken.

Now let’s try vit_base, which is a transformer model.

train('vit_base_patch16_224', 224, epochs=2, accum=4, finetune=False)
print("Report GPU:")
report_gpu()

print("\nGPU_Memory:")
check_gpu_memory()
epoch train_loss valid_loss error_rate time
0 0.000000 0.000000 0.000000 00:09
1 0.000000 0.000000 0.000000 00:08
Report GPU:
GPU:0
process       2200 uses     2998.000 MB GPU memory

GPU_Memory:
Total GPU memory: 14.75 GB
Reserved GPU memory: 0.14 GB
Allocated GPU memory: 0.02 GB
Free GPU memory: 14.59 GB

Scaling It Up: Training on Full Data

Let’s create a dictionary of all the required models and their preprocessing techniques (crop, squish, etc.).

trn_path = train_path

models = {
    'convnext_base_in22k': {
        (Resize(480), 224),
    }, 'vit_base_patch16_224': {
        (Resize(480, method='squish'), 224),
        (Resize(480), 224),
    }
}
models.items()
dict_items([('convnext_base_in22k', {(Resize -- {'size': (480, 480), 'method': 'crop', 'pad_mode': 'reflection', 'resamples': (<Resampling.BILINEAR: 2>, <Resampling.NEAREST: 0>), 'p': 1.0}:
encodes: (Image,object) -> encodes
(TensorBBox,object) -> encodes
(TensorPoint,object) -> encodes
decodes: , 224)}), ('vit_base_patch16_224', {(Resize -- {'size': (480, 480), 'method': 'squish', 'pad_mode': 'reflection', 'resamples': (<Resampling.BILINEAR: 2>, <Resampling.NEAREST: 0>), 'p': 1.0}:
encodes: (Image,object) -> encodes
(TensorBBox,object) -> encodes
(TensorPoint,object) -> encodes
decodes: , 224), (Resize -- {'size': (480, 480), 'method': 'crop', 'pad_mode': 'reflection', 'resamples': (<Resampling.BILINEAR: 2>, <Resampling.NEAREST: 0>), 'p': 1.0}:
encodes: (Image,object) -> encodes
(TensorBBox,object) -> encodes
(TensorPoint,object) -> encodes
decodes: , 224)})])

Append each set of TTA (test-time augmentation) predictions on the test set to a list called tta_res; learn.tta averages predictions over several augmented versions of each test image.

tta_res = []

for arch,details in models.items():
    for item,size in details:
        print('---',arch)
        print(size)
        print(item.name)
        tta_res.append(train(arch, size, item=item, accum=4)) #, epochs=1))
        gc.collect()
        torch.cuda.empty_cache()
        
--- convnext_base_in22k
224
Resize -- {'size': (480, 480), 'method': 'crop', 'pad_mode': 'reflection', 'resamples': (<Resampling.BILINEAR: 2>, <Resampling.NEAREST: 0>), 'p': 1.0}
epoch train_loss valid_loss error_rate time
0 0.921503 0.649621 0.179557 12:54
epoch train_loss valid_loss error_rate time
0 0.623931 0.491215 0.132964 16:42
1 0.590722 0.472942 0.128422 16:56
2 0.414496 0.423165 0.111881 16:38
3 0.234155 0.403548 0.100291 17:14
4 0.168659 0.408064 0.099418 16:41
--- vit_base_patch16_224
224
Resize -- {'size': (480, 480), 'method': 'squish', 'pad_mode': 'reflection', 'resamples': (<Resampling.BILINEAR: 2>, <Resampling.NEAREST: 0>), 'p': 1.0}
epoch train_loss valid_loss error_rate time
0 1.048016 0.767763 0.210891 10:24
epoch train_loss valid_loss error_rate time
0 0.901554 0.667587 0.179441 13:21
1 0.723949 0.652103 0.176296 13:18
2 0.525216 0.550479 0.146302 13:19
3 0.276363 0.497780 0.126616 13:16
4 0.183772 0.490051 0.119802 13:15
--- vit_base_patch16_224
224
Resize -- {'size': (480, 480), 'method': 'crop', 'pad_mode': 'reflection', 'resamples': (<Resampling.BILINEAR: 2>, <Resampling.NEAREST: 0>), 'p': 1.0}
epoch train_loss valid_loss error_rate time
0 1.054235 0.758079 0.205708 10:19
epoch train_loss valid_loss error_rate time
0 0.881566 0.701312 0.193710 13:15
1 0.741848 0.650074 0.174490 13:15
2 0.547625 0.550333 0.144904 13:15
3 0.274780 0.499551 0.125859 13:13
4 0.173512 0.485806 0.118987 13:15

Save the Predictions

save_pickle('/kaggle/working/Lecture6_Part3_tta_res.pkl', tta_res)

Ensemble

Learner.tta returns predictions and targets for each row. We just want the predictions.
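The first(zip(*tta_res)) idiom below transposes a list of (predictions, targets) pairs into (all_predictions, all_targets) and keeps the first group; a tiny illustration with placeholder strings:

# Hypothetical illustration of the zip(*...) transpose idiom.
pairs = [('p1', 't1'), ('p2', 't2'), ('p3', 't3')]  # like [(preds, targs), ...]
preds, targs = zip(*pairs)
print(preds)  # ('p1', 'p2', 'p3') -- all the predictions, grouped together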

tta_prs = first(zip(*tta_res))
tta_prs
(tensor([[1.4376e-09, 6.0367e-10, 8.9613e-10,  ..., 5.5386e-09, 8.7931e-08, 4.2121e-10],
         [4.7340e-05, 7.8465e-06, 1.3157e-05,  ..., 3.5937e-05, 6.6248e-06, 2.0246e-06],
         [2.3749e-05, 2.8122e-07, 8.3784e-08,  ..., 6.6318e-06, 5.9989e-08, 9.9459e-07],
         ...,
         [1.9584e-06, 1.3206e-07, 3.0607e-06,  ..., 1.4922e-07, 5.0935e-09, 8.6562e-08],
         [4.8399e-03, 1.2087e-04, 4.5856e-04,  ..., 5.0185e-06, 9.4111e-05, 1.8684e-05],
         [5.0047e-06, 4.3755e-05, 1.8506e-06,  ..., 6.1585e-06, 5.6130e-07, 1.9855e-06]]),
 tensor([[3.4493e-08, 5.3500e-09, 1.6126e-07,  ..., 3.8252e-08, 1.5104e-06, 2.6966e-09],
         [6.6286e-03, 7.0347e-05, 3.0234e-05,  ..., 1.9852e-04, 4.9287e-05, 1.0562e-05],
         [3.3405e-08, 9.6962e-08, 1.1187e-08,  ..., 7.3134e-08, 6.8583e-09, 2.5434e-08],
         ...,
         [3.6105e-09, 2.5499e-09, 1.0355e-07,  ..., 4.7640e-10, 4.5013e-08, 5.3833e-10],
         [1.8239e-06, 1.0662e-06, 9.2194e-07,  ..., 7.3355e-07, 1.3718e-07, 8.0097e-07],
         [8.3010e-08, 1.5209e-06, 3.5702e-07,  ..., 2.3531e-07, 7.9341e-09, 3.4715e-09]]),
 tensor([[2.3078e-09, 5.1558e-09, 2.3751e-09,  ..., 1.5020e-08, 2.7991e-08, 4.2394e-10],
         [2.2522e-02, 1.2250e-05, 2.0199e-05,  ..., 1.2231e-04, 7.0627e-06, 2.2908e-06],
         [1.0235e-07, 1.3238e-08, 4.5449e-09,  ..., 8.5051e-09, 9.0174e-10, 2.8542e-08],
         ...,
         [1.2824e-07, 1.0214e-08, 1.7250e-07,  ..., 3.6925e-09, 3.7722e-07, 2.4337e-09],
         [1.6956e-05, 5.6481e-06, 5.4464e-06,  ..., 1.2303e-06, 1.3186e-05, 2.0320e-04],
         [4.9362e-08, 1.2064e-06, 1.6594e-06,  ..., 1.0572e-08, 2.2195e-08, 4.6684e-09]]))

An ensemble is a model that combines multiple models; bagging and boosting are two of its types. Those are a bit complicated, so we will stick to a simple version: averaging the predictions.
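As a tiny worked example of the averaging (hypothetical numbers, two models, three classes):

import torch
p1 = torch.tensor([[0.7, 0.2, 0.1]])  # model 1's class probabilities
p2 = torch.tensor([[0.5, 0.4, 0.1]])  # model 2's class probabilities
avg = torch.stack([p1, p2]).mean(0)   # tensor([[0.6000, 0.3000, 0.1000]])
print(avg.argmax(dim=1))              # tensor([0]) -- class 0 wins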

avg_pr = torch.stack(tta_prs).mean(0)
avg_pr.shape
torch.Size([15150, 101])

Test Data Set

dls = ImageDataLoaders.from_folder(trn_path, valid_pct=0.2, item_tfms=Resize(480, method='squish'),
    batch_tfms=aug_transforms(size=224, min_scale=0.75))
idxs = avg_pr.argmax(dim=1)
idxs
tensor([57, 71, 24,  ..., 32, 47, 38])
vocab = np.array(dls.vocab)
vocab
array(['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio', 'beef_tartare', 'beet_salad', 'beignets', 'bibimbap', 'bread_pudding', 'breakfast_burrito', 'bruschetta', 'caesar_salad', 'cannoli',
       'caprese_salad', 'carrot_cake', 'ceviche', 'cheese_plate', 'cheesecake', 'chicken_curry', 'chicken_quesadilla', 'chicken_wings', 'chocolate_cake', 'chocolate_mousse', 'churros',
       'clam_chowder', 'club_sandwich', 'crab_cakes', 'creme_brulee', 'croque_madame', 'cup_cakes', 'deviled_eggs', 'donuts', 'dumplings', 'edamame', 'eggs_benedict', 'escargots', 'falafel',
       'filet_mignon', 'fish_and_chips', 'foie_gras', 'french_fries', 'french_onion_soup', 'french_toast', 'fried_calamari', 'fried_rice', 'frozen_yogurt', 'garlic_bread', 'gnocchi', 'greek_salad',
       'grilled_cheese_sandwich', 'grilled_salmon', 'guacamole', 'gyoza', 'hamburger', 'hot_and_sour_soup', 'hot_dog', 'huevos_rancheros', 'hummus', 'ice_cream', 'lasagna', 'lobster_bisque',
       'lobster_roll_sandwich', 'macaroni_and_cheese', 'macarons', 'miso_soup', 'mussels', 'nachos', 'omelette', 'onion_rings', 'oysters', 'pad_thai', 'paella', 'pancakes', 'panna_cotta',
       'peking_duck', 'pho', 'pizza', 'pork_chop', 'poutine', 'prime_rib', 'pulled_pork_sandwich', 'ramen', 'ravioli', 'red_velvet_cake', 'risotto', 'samosa', 'sashimi', 'scallops', 'seaweed_salad',
       'shrimp_and_grits', 'spaghetti_bolognese', 'spaghetti_carbonara', 'spring_rolls', 'steak', 'strawberry_shortcake', 'sushi', 'tacos', 'takoyaki', 'tiramisu', 'tuna_tartare', 'waffles'],
      dtype='<U23')
tst_files = get_image_files(test_path).sorted()
filenames = [path.name for path in tst_files]
ss = pd.DataFrame({'image_id': filenames})
ss['label'] = vocab[idxs]
ss
image_id label
0 1000314.jpg hummus
1 1000412.jpg paella
2 1000873.jpg clam_chowder
3 100127.jpg spaghetti_bolognese
4 1001332.jpg panna_cotta
... ... ...
15145 999118.jpg french_onion_soup
15146 999178.jpg cup_cakes
15147 999236.jpg dumplings
15148 999449.jpg gnocchi
15149 999908.jpg fish_and_chips

15150 rows × 2 columns

Save the file

ss.to_csv('/kaggle/working/Subm_Part3.csv', index=False)

Concluding Remarks

convnext_base worked much better than convnext_tiny, and significantly better than both preprocessing variants (squish and crop) of the ViT transformer. In the end, we created an ensemble by averaging the predictions of all the models.