
readme basics

Amir Ziai committed 5 years ago (commit e354f07496)
8 changed files with 194 additions and 53 deletions
  1. README.md (+53 -2)
  2. conv.py (+30 -22)
  3. data.py (+1 -0)
  4. dev3.ipynb (+62 -5)
  5. experiments.py (+0 -0)
  6. kissing_detector.py (+33 -9)
  7. requirements.txt (+2 -3)
  8. train.py (+13 -12)

+ 53 - 2
README.md

@@ -1,2 +1,53 @@
-# cs231n-project
-CS231n project
+# Kissing Detector
+Detect kissing scenes in a movie using both audio and video features.
+
+Project for [Stanford CS231N](http://cs231n.stanford.edu)
+
+## Build dataset
+```python
+from pipeline import BuildDataset
+
+videos_and_labels = [
+    # (file name in base_path, label) where label is 1 for kissing and 0 for not kissing
+    ('movies_casino_royale_2006_kissing_1.mp4', 1),
+    ('movies_casino_royale_2006_kissing_2.mp4', 1),
+    ('movies_casino_royale_2006_kissing_3.mp4', 1),
+    ('movies_casino_royale_2006_not_1.mp4', 0),
+    ('movies_casino_royale_2006_not_2.mp4', 0),
+    ('movies_casino_royale_2006_not_3.mp4', 0),
+    
+    ('movies_goldeneye_1995_kissing_1.mp4', 1),
+    ('movies_goldeneye_1995_kissing_2.mp4', 1),
+    ('movies_goldeneye_1995_kissing_3.mp4', 1),
+    ('movies_goldeneye_1995_not_1.mp4', 0),
+    ('movies_goldeneye_1995_not_2.mp4', 0),
+    ('movies_goldeneye_1995_not_3.mp4', 0),
+]
+
+builder = BuildDataset(base_path='path/to/movies',
+                       videos_and_labels=videos_and_labels,
+                       output_path='/path/to/output',
+                       test_size=1 / 3)  # set aside 1/3 of the data for validation
+builder.build_dataset()
+```
+
+### Data loader
+
+
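+Once the train/val splits are on disk, wrap the `AudioVideo` dataset from `data.py`
+in a standard PyTorch `DataLoader`. A minimal sketch, assuming `AudioVideo` takes the
+split directory and yields `(audio, image, label)` tuples (paths and batch size are
+placeholders):
+
+```python
+from torch.utils.data import DataLoader
+
+from data import AudioVideo
+
+# one dataset per split produced by BuildDataset
+datasets = {split: AudioVideo(f'/path/to/output/{split}') for split in ['train', 'val']}
+loaders = {split: DataLoader(datasets[split], batch_size=32, shuffle=(split == 'train'))
+           for split in datasets}
+
+for audio, image, label in loaders['train']:
+    pass  # audio ~ (batch, 1, 96, 64), image ~ (batch, 3, 224, 224)
+```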
+## Explorations
+- ConvNet, VGGish, or both
+- ConvNet architectures: ResNet, VGG, AlexNet, SqueezeNet, DenseNet
+- With and without pre-training
+- 3D convolutions (3DC)
+
+## Diagnostics
+- Saliency maps
+- Class viz
+- Confusion matrices
+- Detected segments
+- Failure examples
+
+## TODO
+- Define experiments
+- ...
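+
+## Train
+A minimal training sketch, assuming the `train_kd` signature in `train.py`
+(data path, architecture, and hyperparameters are placeholders):
+
+```python
+from train import train_kd
+
+# conv_model_name is one of the architectures in conv.py ('resnet', 'alexnet', 'vgg',
+# 'squeezenet', 'densenet'), or None to train on VGGish audio features alone
+kd, hist_acc, hist_f1 = train_kd(data_path_base='/path/to/output',
+                                 conv_model_name='resnet',
+                                 num_epochs=10,
+                                 feature_extract=True,
+                                 batch_size=32)
+```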

+ 30 - 22
conv.py

@@ -1,6 +1,6 @@
+import torch
 from torch import nn
 from torchvision import models
-import torch
 
 
 def set_parameter_requires_grad(model, feature_extracting):
@@ -9,7 +9,10 @@ def set_parameter_requires_grad(model, feature_extracting):
             param.requires_grad = False
 
 
-def convnet_init(model_name, num_classes, feature_extract, use_pretrained=True):
+def convnet_init(model_name: str,
+                 num_classes: int,
+                 feature_extract: bool,
+                 use_pretrained: bool = True):
     # Initialize these variables which will be set in this if statement. Each of these
     #   variables is model specific.
     model_ft = None
@@ -21,19 +24,19 @@ def convnet_init(model_name, num_classes, feature_extract, use_pretrained=True):
         """
         model_ft = models.resnet18(pretrained=use_pretrained)
         set_parameter_requires_grad(model_ft, feature_extract)
-        num_ftrs = model_ft.fc.in_features
+        # num_ftrs = model_ft.fc.in_features
         # model_ft.fc = nn.Linear(num_ftrs, num_classes)
         model_ft.fc = nn.Identity()
         input_size = 224
-        output_size = model_ft(torch.rand((1, 3, input_size, input_size))).shape[1]
 
     elif model_name == "alexnet":
         """ Alexnet
         """
         model_ft = models.alexnet(pretrained=use_pretrained)
         set_parameter_requires_grad(model_ft, feature_extract)
-        num_ftrs = model_ft.classifier[6].in_features
-        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
+        # num_ftrs = model_ft.classifier[6].in_features
+        # model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
+        model_ft.classifier = nn.Identity()
         input_size = 224
 
     elif model_name == "vgg":
@@ -41,8 +44,9 @@ def convnet_init(model_name, num_classes, feature_extract, use_pretrained=True):
         """
         model_ft = models.vgg11_bn(pretrained=use_pretrained)
         set_parameter_requires_grad(model_ft, feature_extract)
-        num_ftrs = model_ft.classifier[6].in_features
-        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
+        # num_ftrs = model_ft.classifier[6].in_features
+        # model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
+        model_ft.classifier = nn.Identity()
         input_size = 224
 
     elif model_name == "squeezenet":
@@ -63,25 +67,29 @@ def convnet_init(model_name, num_classes, feature_extract, use_pretrained=True):
         model_ft = models.densenet121(pretrained=use_pretrained)
         set_parameter_requires_grad(model_ft, feature_extract)
         num_ftrs = model_ft.classifier.in_features
-        model_ft.classifier = nn.Linear(num_ftrs, num_classes)
+        # model_ft.classifier = nn.Linear(num_ftrs, num_classes)
+        model_ft.classifier = nn.Identity()
         input_size = 224
 
-    elif model_name == "inception":
-        """ Inception v3
-        Be careful, expects (299,299) sized images and has auxiliary output
-        """
-        model_ft = models.inception_v3(pretrained=use_pretrained)
-        set_parameter_requires_grad(model_ft, feature_extract)
-        # Handle the auxiliary net
-        num_ftrs = model_ft.AuxLogits.fc.in_features
-        model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
-        # Handle the primary net
-        num_ftrs = model_ft.fc.in_features
-        model_ft.fc = nn.Linear(num_ftrs, num_classes)
-        input_size = 299
+    # elif model_name == "inception":
+    #     """ Inception v3
+    #     Be careful, expects (299,299) sized images and has auxiliary output
+    #     """
+    #     model_ft = models.inception_v3(pretrained=use_pretrained)
+    #     set_parameter_requires_grad(model_ft, feature_extract)
+    #     # Handle the auxiliary net
+    #     num_ftrs = model_ft.AuxLogits.fc.in_features
+    #     model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
+    #     # Handle the primary net
+    #     num_ftrs = model_ft.fc.in_features
+    #     # model_ft.fc = nn.Linear(num_ftrs, num_classes)
+    #     model_ft.fc = nn.Identity()
+    #     input_size = 299
 
     else:
         print("Invalid model name, exiting...")
         exit()
 
+    # infer the feature dimension with a dummy forward pass through the truncated model
+    output_size = model_ft(torch.rand((1, 3, input_size, input_size))).shape[1]
+
     return model_ft, input_size, output_size

+ 1 - 0
data.py

@@ -154,6 +154,7 @@ class AudioVideo(data.Dataset):
         return len(self.data)
 
     def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, int]:
+        # output format:
         # return (
         #     torch.rand((1, 96, 64)),
         #     torch.rand((3, 224, 224)),

+ 62 - 5
dev3.ipynb

@@ -40,6 +40,63 @@
     "batch_size = 32"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    tensor(0.5765, dtype=torch.float64)\n",
+       "1    tensor(0.4118, dtype=torch.float64)\n",
+       "2    tensor(0.6471, dtype=torch.float64)\n",
+       "3    tensor(0.8353, dtype=torch.float64)\n",
+       "4    tensor(0.7882, dtype=torch.float64)\n",
+       "5    tensor(0.4588, dtype=torch.float64)\n",
+       "6    tensor(0.5647, dtype=torch.float64)\n",
+       "7    tensor(0.8118, dtype=torch.float64)\n",
+       "8    tensor(0.8118, dtype=torch.float64)\n",
+       "9    tensor(0.7059, dtype=torch.float64)\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.Series(history)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[tensor(0.5765, dtype=torch.float64),\n",
+       " tensor(0.4118, dtype=torch.float64),\n",
+       " tensor(0.6471, dtype=torch.float64),\n",
+       " tensor(0.8353, dtype=torch.float64),\n",
+       " tensor(0.7882, dtype=torch.float64),\n",
+       " tensor(0.4588, dtype=torch.float64),\n",
+       " tensor(0.5647, dtype=torch.float64),\n",
+       " tensor(0.8118, dtype=torch.float64),\n",
+       " tensor(0.8118, dtype=torch.float64),\n",
+       " tensor(0.7059, dtype=torch.float64)]"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": 37,
@@ -109,11 +166,11 @@
     }
    ],
    "source": [
-    "kd, history = train_kd(data_path_base,\n",
-    "                       model_name,\n",
-    "                       num_epochs,\n",
-    "                       feature_extract,\n",
-    "                       batch_size)"
+    "kd, hist_acc, hist_f1 = train_kd(data_path_base,\n",
+    "                                 model_name,\n",
+    "                                 num_epochs,\n",
+    "                                 feature_extract,\n",
+    "                                 batch_size)"
    ]
   },
   {

+ 0 - 0
experiments.py


+ 33 - 9
kissing_detector.py

@@ -2,22 +2,46 @@ import torch
 from torch import nn
 import vggish
 from conv import convnet_init
+from typing import Optional
 
 
 class KissingDetector(nn.Module):
-    def __init__(self, model_name: str, num_classes: int, feature_extract: bool, use_pretrained: bool = True):
+    def __init__(self,
+                 conv_model_name: Optional[str],
+                 num_classes: int,
+                 feature_extract: bool,
+                 use_pretrained: bool = True,
+                 use_vggish: bool = True):
         super(KissingDetector, self).__init__()
-        conv, conv_input_size, conv_output_size = convnet_init(model_name, num_classes, feature_extract,
-                                                               use_pretrained=use_pretrained)
-        vggish_model, vggish_output_size = vggish.vggish(feature_extract)
+        conv_output_size = 0
+        vggish_output_size = 0
+        conv_input_size = 0
+        conv = None
+        vggish_model = None
+
+        if conv_model_name:
+            conv, conv_input_size, conv_output_size = convnet_init(conv_model_name,
+                                                                   num_classes,
+                                                                   feature_extract,
+                                                                   use_pretrained)
+        if use_vggish:
+            vggish_model, vggish_output_size = vggish.vggish(feature_extract)
+
+        if not conv and not vggish_model:
+            raise ValueError("Use VGGish, Conv, or both")
+
         self.conv_input_size = conv_input_size
         self.conv = conv
         self.vggish = vggish_model
         self.combined = nn.Linear(vggish_output_size + conv_output_size, num_classes)
 
     def forward(self, audio: torch.Tensor, image: torch.Tensor):
-        a = self.vggish(audio)
-        c = self.conv(image)
-        combined = torch.cat((c.view(c.size(0), -1), a.view(a.size(0), -1)), dim=1)
-        out = self.combined(combined)
-        return out
+        a = self.vggish(audio) if self.vggish is not None else None
+        c = self.conv(image) if self.conv is not None else None
+
+        # concatenate the audio and image features when both streams are enabled
+        if a is not None and c is not None:
+            combined = torch.cat((c.view(c.size(0), -1), a.view(a.size(0), -1)), dim=1)
+        else:
+            combined = a if a is not None else c
+
+        return self.combined(combined)

+ 2 - 3
requirements.txt

@@ -2,8 +2,7 @@ torch
 torchvision
 resampy
 soundfile
-PIL
-accimage
+Pillow
 numpy
 moviepy
-cv2
+opencv-python

+ 13 - 12
train.py

@@ -1,6 +1,6 @@
 import copy
 import time
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 
 import torch
 import torch.optim as optim
@@ -26,16 +26,16 @@ def _get_params_to_update(model: nn.Module,
 
 
 def train_kd(data_path_base: str,
-             model_name: str,
+             conv_model_name: Optional[str],
              num_epochs: int,
              feature_extract: bool,
              batch_size: int,
              num_workers: int = 4,
              shuffle: bool = True,
              lr: float = 0.001,
-             momentum: float = 0.9) -> Tuple[nn.Module, List[torch.Tensor]]:
+             momentum: float = 0.9) -> Tuple[nn.Module, List[float], List[float]]:
     num_classes = 2
-    kd = KissingDetector(model_name, num_classes, feature_extract)
+    kd = KissingDetector(conv_model_name, num_classes, feature_extract)
     params_to_update = _get_params_to_update(kd, feature_extract)
 
     datasets = {set_: AudioVideo(f'{data_path_base}/{set_}') for set_ in ['train', 'val']}
@@ -48,16 +48,16 @@ def train_kd(data_path_base: str,
     # Setup the loss fxn
     criterion = nn.CrossEntropyLoss()
 
-    model_ft, hist = train_model(kd,
-                                 dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs,
-                                 is_inception=(model_name == "inception"))
-    return model_ft, hist
+    return train_model(kd,
+                       dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs,
+                       is_inception=(conv_model_name == "inception"))
 
 
 def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False):
     since = time.time()
 
     val_acc_history = []
+    val_f1_history = []
 
     best_model_wts = copy.deepcopy(model.state_dict())
     best_acc = 0.0
@@ -142,15 +142,16 @@ def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_ince
                 best_f1 = epoch_f1
                 best_model_wts = copy.deepcopy(model.state_dict())
             if phase == 'val':
-                val_acc_history.append(epoch_acc)
+                val_acc_history.append(float(epoch_acc))
+                val_f1_history.append(float(epoch_f1))
 
         print()
 
     time_elapsed = time.time() - since
     print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
-    print('Best val F1 : {:4f}'.format(best_f1))
-    print('Best val Acc: {:4f}'.format(best_acc))
+    print('Best val F1  : {:.4f}'.format(best_f1))
+    print('Best val Acc : {:.4f}'.format(best_acc))
 
     # load best model weights
     model.load_state_dict(best_model_wts)
-    return model, val_acc_history
+    return model, val_acc_history, val_f1_history