
readme basics

Amir Ziai committed 5 years ago (commit e354f07496)
8 changed files with 194 additions and 53 deletions
  1. README.md (+53 -2)
  2. conv.py (+30 -22)
  3. data.py (+1 -0)
  4. dev3.ipynb (+62 -5)
  5. experiments.py (+0 -0)
  6. kissing_detector.py (+33 -9)
  7. requirements.txt (+2 -3)
  8. train.py (+13 -12)

+ 53 - 2
README.md

@@ -1,2 +1,53 @@
-# cs231n-project
-CS231n project
+# Kissing Detector
+Detect kissing scenes in a movie using both audio and video features.
+
+Project for [Stanford CS231N](http://cs231n.stanford.edu)
+
+## Build dataset
+```python
+from pipeline import BuildDataset
+
+videos_and_labels = [
+    # (file name in base_path, label) where label is 1 for kissing and 0 for not kissing
+    ('movies_casino_royale_2006_kissing_1.mp4', 1),
+    ('movies_casino_royale_2006_kissing_2.mp4', 1),
+    ('movies_casino_royale_2006_kissing_3.mp4', 1),
+    ('movies_casino_royale_2006_not_1.mp4', 0),
+    ('movies_casino_royale_2006_not_2.mp4', 0),
+    ('movies_casino_royale_2006_not_3.mp4', 0),
+    
+    ('movies_goldeneye_1995_kissing_1.mp4', 1),
+    ('movies_goldeneye_1995_kissing_2.mp4', 1),
+    ('movies_goldeneye_1995_kissing_3.mp4', 1),
+    ('movies_goldeneye_1995_not_1.mp4', 0),
+    ('movies_goldeneye_1995_not_2.mp4', 0),
+    ('movies_goldeneye_1995_not_3.mp4', 0),
+]
+
+builder = BuildDataset(base_path='path/to/movies',
+                       videos_and_labels=videos_and_labels,
+                       output_path='/path/to/output',
+                       test_size=1 / 3)  # set aside 1/3 of the data for validation
+builder.build_dataset()
+```
+
+### Data loader
+
+
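+Once the train/val splits are on disk, wrap the `AudioVideo` dataset from `data.py`
+in a standard PyTorch `DataLoader`. A minimal sketch, assuming `AudioVideo` takes the
+split directory and yields `(audio, image, label)` tuples (paths and batch size are
+placeholders):
+
+```python
+from torch.utils.data import DataLoader
+
+from data import AudioVideo
+
+# one dataset per split produced by BuildDataset
+datasets = {split: AudioVideo(f'/path/to/output/{split}') for split in ['train', 'val']}
+loaders = {split: DataLoader(datasets[split], batch_size=32, shuffle=(split == 'train'))
+           for split in datasets}
+
+for audio, image, label in loaders['train']:
+    pass  # audio ~ (batch, 1, 96, 64), image ~ (batch, 3, 224, 224)
+```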
+## Explorations
+- ConvNet, VGGish, or both
+- ConvNet architectures: ResNet, VGG, AlexNet, SqueezeNet, DenseNet
+- With and without pre-training
+- 3D convolutions (3DC)
+
+## Diagnostics
+- Saliency maps
+- Class viz
+- Confusion matrices
+- Detected segments
+- Failure examples
+
+## TODO
+- Define experiments
+- ...
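+
+## Train
+A minimal training sketch, assuming the `train_kd` signature in `train.py`
+(data path, architecture, and hyperparameters are placeholders):
+
+```python
+from train import train_kd
+
+# conv_model_name is one of the architectures in conv.py ('resnet', 'alexnet', 'vgg',
+# 'squeezenet', 'densenet'), or None to train on VGGish audio features alone
+kd, hist_acc, hist_f1 = train_kd(data_path_base='/path/to/output',
+                                 conv_model_name='resnet',
+                                 num_epochs=10,
+                                 feature_extract=True,
+                                 batch_size=32)
+```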

+ 30 - 22
conv.py

@@ -1,6 +1,6 @@
+import torch
 from torch import nn
 from torchvision import models
-import torch
 
 
 def set_parameter_requires_grad(model, feature_extracting):
@@ -9,7 +9,10 @@ def set_parameter_requires_grad(model, feature_extracting):
             param.requires_grad = False
 
 
-def convnet_init(model_name, num_classes, feature_extract, use_pretrained=True):
+def convnet_init(model_name: str,
+                 num_classes: int,
+                 feature_extract: bool,
+                 use_pretrained: bool = True):
     # Initialize these variables which will be set in this if statement. Each of these
     #   variables is model specific.
     model_ft = None
@@ -21,19 +24,19 @@ def convnet_init(model_name, num_classes, feature_extract, use_pretrained=True):
         """
         model_ft = models.resnet18(pretrained=use_pretrained)
         set_parameter_requires_grad(model_ft, feature_extract)
-        num_ftrs = model_ft.fc.in_features
+        # num_ftrs = model_ft.fc.in_features
         # model_ft.fc = nn.Linear(num_ftrs, num_classes)
         model_ft.fc = nn.Identity()
         input_size = 224
-        output_size = model_ft(torch.rand((1, 3, input_size, input_size))).shape[1]
 
     elif model_name == "alexnet":
         """ Alexnet
         """
         model_ft = models.alexnet(pretrained=use_pretrained)
         set_parameter_requires_grad(model_ft, feature_extract)
-        num_ftrs = model_ft.classifier[6].in_features
-        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
+        # num_ftrs = model_ft.classifier[6].in_features
+        # model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
+        model_ft.classifier = nn.Identity()
         input_size = 224
 
     elif model_name == "vgg":
@@ -41,8 +44,9 @@ def convnet_init(model_name, num_classes, feature_extract, use_pretrained=True):
         """
         model_ft = models.vgg11_bn(pretrained=use_pretrained)
         set_parameter_requires_grad(model_ft, feature_extract)
-        num_ftrs = model_ft.classifier[6].in_features
-        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
+        # num_ftrs = model_ft.classifier[6].in_features
+        # model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
+        model_ft.classifier = nn.Identity()
         input_size = 224
 
     elif model_name == "squeezenet":
@@ -63,25 +67,29 @@ def convnet_init(model_name, num_classes, feature_extract, use_pretrained=True):
         model_ft = models.densenet121(pretrained=use_pretrained)
         set_parameter_requires_grad(model_ft, feature_extract)
         num_ftrs = model_ft.classifier.in_features
-        model_ft.classifier = nn.Linear(num_ftrs, num_classes)
+        # model_ft.classifier = nn.Linear(num_ftrs, num_classes)
+        model_ft.classifier = nn.Identity()
         input_size = 224
 
-    elif model_name == "inception":
-        """ Inception v3
-        Be careful, expects (299,299) sized images and has auxiliary output
-        """
-        model_ft = models.inception_v3(pretrained=use_pretrained)
-        set_parameter_requires_grad(model_ft, feature_extract)
-        # Handle the auxiliary net
-        num_ftrs = model_ft.AuxLogits.fc.in_features
-        model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
-        # Handle the primary net
-        num_ftrs = model_ft.fc.in_features
-        model_ft.fc = nn.Linear(num_ftrs, num_classes)
-        input_size = 299
+    # elif model_name == "inception":
+    #     """ Inception v3
+    #     Be careful, expects (299,299) sized images and has auxiliary output
+    #     """
+    #     model_ft = models.inception_v3(pretrained=use_pretrained)
+    #     set_parameter_requires_grad(model_ft, feature_extract)
+    #     # Handle the auxiliary net
+    #     num_ftrs = model_ft.AuxLogits.fc.in_features
+    #     model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
+    #     # Handle the primary net
+    #     num_ftrs = model_ft.fc.in_features
+    #     # model_ft.fc = nn.Linear(num_ftrs, num_classes)
+    #     model_ft.fc = nn.Identity()
+    #     input_size = 299
 
     else:
         print("Invalid model name, exiting...")
         exit()
 
+    # infer the feature dimension with a dummy forward pass through the truncated model
+    output_size = model_ft(torch.rand((1, 3, input_size, input_size))).shape[1]
+
     return model_ft, input_size, output_size

+ 1 - 0
data.py

@@ -154,6 +154,7 @@ class AudioVideo(data.Dataset):
         return len(self.data)
 
     def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, int]:
+        # output format:
         # return (
         #     torch.rand((1, 96, 64)),
         #     torch.rand((3, 224, 224)),

+ 62 - 5
dev3.ipynb

@@ -40,6 +40,63 @@
     "batch_size = 32"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    tensor(0.5765, dtype=torch.float64)\n",
+       "1    tensor(0.4118, dtype=torch.float64)\n",
+       "2    tensor(0.6471, dtype=torch.float64)\n",
+       "3    tensor(0.8353, dtype=torch.float64)\n",
+       "4    tensor(0.7882, dtype=torch.float64)\n",
+       "5    tensor(0.4588, dtype=torch.float64)\n",
+       "6    tensor(0.5647, dtype=torch.float64)\n",
+       "7    tensor(0.8118, dtype=torch.float64)\n",
+       "8    tensor(0.8118, dtype=torch.float64)\n",
+       "9    tensor(0.7059, dtype=torch.float64)\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.Series(history)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[tensor(0.5765, dtype=torch.float64),\n",
+       " tensor(0.4118, dtype=torch.float64),\n",
+       " tensor(0.6471, dtype=torch.float64),\n",
+       " tensor(0.8353, dtype=torch.float64),\n",
+       " tensor(0.7882, dtype=torch.float64),\n",
+       " tensor(0.4588, dtype=torch.float64),\n",
+       " tensor(0.5647, dtype=torch.float64),\n",
+       " tensor(0.8118, dtype=torch.float64),\n",
+       " tensor(0.8118, dtype=torch.float64),\n",
+       " tensor(0.7059, dtype=torch.float64)]"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": 37,
@@ -109,11 +166,11 @@
     }
    ],
    "source": [
-    "kd, history = train_kd(data_path_base,\n",
-    "                       model_name,\n",
-    "                       num_epochs,\n",
-    "                       feature_extract,\n",
-    "                       batch_size)"
+    "kd, hist_acc, hist_f1 = train_kd(data_path_base,\n",
+    "                                 model_name,\n",
+    "                                 num_epochs,\n",
+    "                                 feature_extract,\n",
+    "                                 batch_size)"
    ]
   },
   {

+ 0 - 0
experiments.py


+ 33 - 9
kissing_detector.py

@@ -2,22 +2,46 @@ import torch
 from torch import nn
 import vggish
 from conv import convnet_init
+from typing import Optional
 
 
 class KissingDetector(nn.Module):
-    def __init__(self, model_name: str, num_classes: int, feature_extract: bool, use_pretrained: bool = True):
+    def __init__(self,
+                 conv_model_name: Optional[str],
+                 num_classes: int,
+                 feature_extract: bool,
+                 use_pretrained: bool = True,
+                 use_vggish: bool = True):
         super(KissingDetector, self).__init__()
-        conv, conv_input_size, conv_output_size = convnet_init(model_name, num_classes, feature_extract,
-                                                               use_pretrained=use_pretrained)
-        vggish_model, vggish_output_size = vggish.vggish(feature_extract)
+        conv_output_size = 0
+        vggish_output_size = 0
+        conv_input_size = 0
+        conv = None
+        vggish_model = None
+
+        if conv_model_name:
+            conv, conv_input_size, conv_output_size = convnet_init(conv_model_name,
+                                                                   num_classes,
+                                                                   feature_extract,
+                                                                   use_pretrained)
+        if use_vggish:
+            vggish_model, vggish_output_size = vggish.vggish(feature_extract)
+
+        if not conv and not vggish_model:
+            raise ValueError("Use VGGish, Conv, or both")
+
         self.conv_input_size = conv_input_size
         self.conv = conv
         self.vggish = vggish_model
         self.combined = nn.Linear(vggish_output_size + conv_output_size, num_classes)
 
     def forward(self, audio: torch.Tensor, image: torch.Tensor):
-        a = self.vggish(audio)
-        c = self.conv(image)
-        combined = torch.cat((c.view(c.size(0), -1), a.view(a.size(0), -1)), dim=1)
-        out = self.combined(combined)
-        return out
+        a = self.vggish(audio) if self.vggish is not None else None
+        c = self.conv(image) if self.conv is not None else None
+
+        # concatenate the audio and image features when both streams are enabled
+        if a is not None and c is not None:
+            combined = torch.cat((c.view(c.size(0), -1), a.view(a.size(0), -1)), dim=1)
+        else:
+            combined = a if a is not None else c
+
+        return self.combined(combined)

+ 2 - 3
requirements.txt

@@ -2,8 +2,7 @@ torch
 torchvision
 resampy
 soundfile
-PIL
-accimage
+Pillow
 numpy
 moviepy
-cv2
+opencv-python

+ 13 - 12
train.py

@@ -1,6 +1,6 @@
 import copy
 import time
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 
 import torch
 import torch.optim as optim
@@ -26,16 +26,16 @@ def _get_params_to_update(model: nn.Module,
 
 
 def train_kd(data_path_base: str,
-             model_name: str,
+             conv_model_name: Optional[str],
              num_epochs: int,
              feature_extract: bool,
              batch_size: int,
              num_workers: int = 4,
              shuffle: bool = True,
              lr: float = 0.001,
-             momentum: float = 0.9) -> Tuple[nn.Module, List[torch.Tensor]]:
+             momentum: float = 0.9) -> Tuple[nn.Module, List[float], List[float]]:
     num_classes = 2
-    kd = KissingDetector(model_name, num_classes, feature_extract)
+    kd = KissingDetector(conv_model_name, num_classes, feature_extract)
     params_to_update = _get_params_to_update(kd, feature_extract)
 
     datasets = {set_: AudioVideo(f'{data_path_base}/{set_}') for set_ in ['train', 'val']}
@@ -48,16 +48,16 @@ def train_kd(data_path_base: str,
     # Setup the loss fxn
     criterion = nn.CrossEntropyLoss()
 
-    model_ft, hist = train_model(kd,
-                                 dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs,
-                                 is_inception=(model_name == "inception"))
-    return model_ft, hist
+    return train_model(kd,
+                       dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs,
+                       is_inception=(conv_model_name == "inception"))
 
 
 def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False):
     since = time.time()
 
     val_acc_history = []
+    val_f1_history = []
 
     best_model_wts = copy.deepcopy(model.state_dict())
     best_acc = 0.0
@@ -142,15 +142,16 @@ def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_ince
                 best_f1 = epoch_f1
                 best_model_wts = copy.deepcopy(model.state_dict())
             if phase == 'val':
-                val_acc_history.append(epoch_acc)
+                val_acc_history.append(float(epoch_acc))
+                val_f1_history.append(float(epoch_f1))
 
         print()
 
     time_elapsed = time.time() - since
     print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
-    print('Best val F1 : {:4f}'.format(best_f1))
-    print('Best val Acc: {:4f}'.format(best_acc))
+    print('Best val F1  : {:.4f}'.format(best_f1))
+    print('Best val Acc : {:.4f}'.format(best_acc))
 
     # load best model weights
     model.load_state_dict(best_model_wts)
-    return model, val_acc_history
+    return model, val_acc_history, val_f1_history