Amir Ziai 4 years ago
commit 98455f0e8f
14 changed files with 43 additions and 275 deletions
  1. README.md (+12 −24)
  2. conv.py (+1 −0)
  3. dev/dev4.ipynb (+0 −0)
  4. dev/dev5.ipynb (+0 −0)
  5. dev/dev6.ipynb (+0 −0)
  6. dev/dev7.ipynb (+0 −0)
  7. dev/dev8.ipynb (+0 −0)
  8. experiments.py (+1 −1)
  9. params.py (+1 −1)
  10. qualitative.py (+26 −24)
  11. requirements.txt (+1 −0)
  12. spatial_transforms.py (+0 −175)
  13. temporal_transforms.py (+0 −50)
  14. train.py (+1 −0)

+ 12 - 24
README.md

@@ -3,7 +3,16 @@ Detect kissing scenes in a movie using both audio and video features.
 
 Project for [Stanford CS231N](http://cs231n.stanford.edu)
 
+## Running the code
+Use Python 3.6+
+```bash
+python3 experiments.py
+```
+
+This runs the experiments specified by the `experiments` dictionary in `params.py`.
+
 ## Build dataset
+The following builds the dataset for training. You need to provide a path to the video segments.
 ```python
 from pipeline import BuildDataset
 
@@ -31,27 +40,6 @@ builder = BuildDataset(base_path='path/to/movies',
 builder.build_dataset()
 ```
 
-### Data loader
-
-
-## Explorations:
-- ConvNet, VGGish, or both
-- ConvNet architectures: ResNet, VGG, AlexNet, SqueezeNet, DenseNet
-- With and without pre-training
-- 
-- (3DC) 
-
-## Diagnostics
-- Saliency maps
-- Class viz
-- Confusion matrices
-- Detected segments
-- Failure examples
-
-## TODO
-- Qual
-    - Saliency map
-    - class viz
-    - Error examples
-    - Audio?
-- 3DC
+## Resources heavily used
+- [Video Classification Using 3D ResNet](https://github.com/kenshohara/video-classification-3d-cnn-pytorch)
+- [CS231N assignment 3](http://cs231n.github.io/assignments2019/assignment3/)

+ 1 - 0
conv.py

@@ -1,3 +1,4 @@
+# adapted from PyTorch tutorials
 import torch
 from torch import nn
 from torchvision import models

+ 0 - 0
dev4.ipynb → dev/dev4.ipynb


+ 0 - 0
dev5.ipynb → dev/dev5.ipynb


+ 0 - 0
dev6.ipynb → dev/dev6.ipynb


+ 0 - 0
dev7.ipynb → dev/dev7.ipynb


+ 0 - 0
dev8.ipynb → dev/dev8.ipynb


+ 1 - 1
experiments.py

@@ -70,5 +70,5 @@ class ExperimentRunner:
 
 
 if __name__ == '__main__':
-    experiment1 = ExperimentRunner(params.experiment1_test, n_jobs=params.n_jobs)
+    experiment1 = ExperimentRunner(params.experiments, n_jobs=params.n_jobs)
     experiment1.run()
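Given the `n_jobs` argument here and the `joblib` entry already in `requirements.txt`, the runner presumably dispatches runs in parallel with joblib. A hedged sketch of that pattern (`run_one` is a hypothetical stand-in for whatever `ExperimentRunner` executes per configuration):

```python
# Hypothetical sketch of joblib-based dispatch; ExperimentRunner's actual
# internals are not shown in this diff.
from joblib import Parallel, delayed

def run_one(config):
    """Train and evaluate a single configuration (stub)."""
    return config

def run_all(configs, n_jobs):
    return Parallel(n_jobs=n_jobs)(delayed(run_one)(c) for c in configs)
```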

+ 1 - 1
params.py

@@ -34,7 +34,7 @@ experiment_test_3d = {
 
 experiments = {
     'data_path_base': {data_path_base},
-    'conv_model_name': {'resnet', None},  # vgg
+    'conv_model_name': {'resnet', None, 'vgg', 'densenet', 'squeezenet'},
     'num_epochs': {10},
     'feature_extract': {True, False},
     'batch_size': {64},
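Each value in the `experiments` dictionary is a set of candidate settings, so the dictionary as a whole describes a hyperparameter grid. A minimal sketch of how such a grid could be expanded into per-run configurations (the `expand_grid` helper below is hypothetical, not the repo's actual `ExperimentRunner` logic):

```python
# Hypothetical sketch: expand a dict of value-sets into one config per run.
from itertools import product

def expand_grid(grid):
    keys = sorted(grid)  # fix an order, since set iteration order is arbitrary
    for values in product(*(grid[k] for k in keys)):
        yield dict(zip(keys, values))

# e.g. {'conv_model_name': {'resnet', 'vgg'}, 'feature_extract': {True, False}}
# expands to 4 run configurations.
```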

+ 26 - 24
qualitative.py

@@ -1,11 +1,15 @@
+# adapted from http://cs231n.github.io/assignments2019/assignment3/
+import random
 from typing import Dict, List
 
 import matplotlib.pyplot as plt
+import numpy as np
 import torch
 from PIL import Image
+from scipy.ndimage.filters import gaussian_filter1d
 from torch import nn
-import numpy as np
 
+import params
 from pipeline import BuildDataset
 
 
@@ -117,7 +121,15 @@ class QualitativeAnalysis:
             X = torch.cat([bottom, top], dim=2)
         return X
 
-    def create_class_visualization(target_y, model, dtype, **kwargs):
+    @staticmethod
+    def _blur_image(X, sigma=1):
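+        # Gaussian-blur the H and W axes of an (N, C, H, W) batch in place, as a smoothness regularizer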
+        X_np = X.cpu().clone().numpy()
+        X_np = gaussian_filter1d(X_np, sigma, axis=2)
+        X_np = gaussian_filter1d(X_np, sigma, axis=3)
+        X.copy_(torch.Tensor(X_np).type_as(X))
+        return X
+
+    def create_class_visualization(self, target_y, model, dtype, a, **kwargs):
         """
         Generate an image to maximize the score of target_y under a pretrained model.
 
@@ -134,6 +146,10 @@ class QualitativeAnalysis:
         - max_jitter: How much to jitter the image as an implicit regularizer
         - show_every: How often to show the intermediate result
         """
+
+        def deprocess(x):
+            return BuildDataset.transform_reverse(x.squeeze(0))
+
         model.type(dtype)
         l2_reg = kwargs.pop('l2_reg', 1e-3)
         learning_rate = kwargs.pop('learning_rate', 25)
@@ -148,44 +164,30 @@ class QualitativeAnalysis:
         for t in range(num_iterations):
             # Randomly jitter the image a bit; this gives slightly nicer results
             ox, oy = random.randint(0, max_jitter), random.randint(0, max_jitter)
-            img.data.copy_(jitter(img.data, ox, oy))
-
-            ########################################################################
-            # TODO: Use the model to compute the gradient of the score for the     #
-            # class target_y with respect to the pixels of the image, and make a   #
-            # gradient step on the image using the learning rate. Don't forget the #
-            # L2 regularization term!                                              #
-            # Be very careful about the signs of elements in your code.            #
-            ########################################################################
-            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
-
-            target = model(img)[0, target_y]
+            img.data.copy_(self.jitter(img.data, ox, oy))
+
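+            # Gradient ascent on the target class score; the L2 term penalizes large pixel values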
+            target = model(a, img)[0, target_y]
             target.backward()
             g = img.grad.data
             g -= 2 * l2_reg * img.data
             img.data += learning_rate * (g / g.norm())
             img.grad.zero_()
 
-            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
-            ########################################################################
-            #                             END OF YOUR CODE                         #
-            ########################################################################
-
             # Undo the random jitter
-            img.data.copy_(jitter(img.data, -ox, -oy))
+            img.data.copy_(self.jitter(img.data, -ox, -oy))
 
             # As regularizer, clamp and periodically blur the image
             for c in range(3):
-                lo = float(-SQUEEZENET_MEAN[c] / SQUEEZENET_STD[c])
-                hi = float((1.0 - SQUEEZENET_MEAN[c]) / SQUEEZENET_STD[c])
+                lo = float(-params.mean[c] / params.std[c])
+                hi = float((1.0 - params.mean[c]) / params.std[c])
                 img.data[:, c].clamp_(min=lo, max=hi)
             if t % blur_every == 0:
-                blur_image(img.data, sigma=0.5)
+                self._blur_image(img.data, sigma=0.5)
 
             # Periodically show the image
             if t == 0 or (t + 1) % show_every == 0 or t == num_iterations - 1:
                 plt.imshow(deprocess(img.data.clone().cpu()))
-                class_name = class_names[target_y]
+                class_name = self.class_names[target_y]
                 plt.title('%s\nIteration %d / %d' % (class_name, t + 1, num_iterations))
                 plt.gcf().set_size_inches(4, 4)
                 plt.axis('off')
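The loop above is standard class visualization by gradient ascent: jitter the image, step along the gradient of the L2-regularized class score, undo the jitter, then clamp and periodically blur as regularizers. The core update can be reproduced in isolation; a minimal, self-contained sketch with a toy model (everything below is illustrative, and unlike this repo's model it takes no audio input `a`):

```python
# Toy demonstration of the gradient-ascent update used above (hypothetical model).
import torch
from torch import nn

model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 8 * 8, 2))  # stand-in 2-class scorer
img = torch.randn(1, 3, 8, 8, requires_grad=True)
l2_reg, learning_rate, target_y = 1e-3, 25.0, 1

for _ in range(10):
    score = model(img)[0, target_y]            # class score to maximize
    score.backward()
    g = img.grad.data - 2 * l2_reg * img.data  # ascend score - l2_reg * ||img||^2
    img.data += learning_rate * g / g.norm()   # normalized gradient step
    img.grad.zero_()
```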

+ 1 - 0
requirements.txt

@@ -10,3 +10,4 @@ joblib
 pandas
 matplotlib
 pytube
+scipy

+ 0 - 175
spatial_transforms.py

@@ -1,175 +0,0 @@
-import random
-import math
-import numbers
-import collections
-import numpy as np
-import torch
-from PIL import Image, ImageOps
-try:
-    import accimage
-except ImportError:
-    accimage = None
-
-
-class Compose(object):
-    """Composes several transforms together.
-    Args:
-        transforms (list of ``Transform`` objects): list of transforms to compose.
-    Example:
-        >>> transforms.Compose([
-        >>>     transforms.CenterCrop(10),
-        >>>     transforms.ToTensor(),
-        >>> ])
-    """
-
-    def __init__(self, transforms):
-        self.transforms = transforms
-
-    def __call__(self, img):
-        for t in self.transforms:
-            img = t(img)
-        return img
-
-
-class ToTensor(object):
-    """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor.
-    Converts a PIL.Image or numpy.ndarray (H x W x C) in the range
-    [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
-    """
-
-    def __call__(self, pic):
-        """
-        Args:
-            pic (PIL.Image or numpy.ndarray): Image to be converted to tensor.
-        Returns:
-            Tensor: Converted image.
-        """
-        if isinstance(pic, np.ndarray):
-            # handle numpy array
-            img = torch.from_numpy(pic.transpose((2, 0, 1)))
-            # backward compatibility
-            return img.float()
-
-        if accimage is not None and isinstance(pic, accimage.Image):
-            nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32)
-            pic.copyto(nppic)
-            return torch.from_numpy(nppic)
-
-        # handle PIL Image
-        if pic.mode == 'I':
-            img = torch.from_numpy(np.array(pic, np.int32, copy=False))
-        elif pic.mode == 'I;16':
-            img = torch.from_numpy(np.array(pic, np.int16, copy=False))
-        else:
-            img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
-        # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
-        if pic.mode == 'YCbCr':
-            nchannel = 3
-        elif pic.mode == 'I;16':
-            nchannel = 1
-        else:
-            nchannel = len(pic.mode)
-        img = img.view(pic.size[1], pic.size[0], nchannel)
-        # put it from HWC to CHW format
-        # yikes, this transpose takes 80% of the loading time/CPU
-        img = img.transpose(0, 1).transpose(0, 2).contiguous()
-        if isinstance(img, torch.ByteTensor):
-            return img.float()
-        else:
-            return img
-
-
-class Normalize(object):
-    """Normalize an tensor image with mean and standard deviation.
-    Given mean: (R, G, B) and std: (R, G, B),
-    will normalize each channel of the torch.*Tensor, i.e.
-    channel = (channel - mean) / std
-    Args:
-        mean (sequence): Sequence of means for R, G, B channels respecitvely.
-        std (sequence): Sequence of standard deviations for R, G, B channels
-            respecitvely.
-    """
-
-    def __init__(self, mean, std):
-        self.mean = mean
-        self.std = std
-
-    def __call__(self, tensor):
-        """
-        Args:
-            tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
-        Returns:
-            Tensor: Normalized image.
-        """
-        # TODO: make efficient
-        for t, m, s in zip(tensor, self.mean, self.std):
-            t.sub_(m).div_(s)
-        return tensor
-
-
-class Scale(object):
-    """Rescale the input PIL.Image to the given size.
-    Args:
-        size (sequence or int): Desired output size. If size is a sequence like
-            (w, h), output size will be matched to this. If size is an int,
-            smaller edge of the image will be matched to this number.
-            i.e, if height > width, then image will be rescaled to
-            (size * height / width, size)
-        interpolation (int, optional): Desired interpolation. Default is
-            ``PIL.Image.BILINEAR``
-    """
-
-    def __init__(self, size, interpolation=Image.BILINEAR):
-        assert isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2)
-        self.size = size
-        self.interpolation = interpolation
-
-    def __call__(self, img):
-        """
-        Args:
-            img (PIL.Image): Image to be scaled.
-        Returns:
-            PIL.Image: Rescaled image.
-        """
-        if isinstance(self.size, int):
-            w, h = img.size
-            if (w <= h and w == self.size) or (h <= w and h == self.size):
-                return img
-            if w < h:
-                ow = self.size
-                oh = int(self.size * h / w)
-                return img.resize((ow, oh), self.interpolation)
-            else:
-                oh = self.size
-                ow = int(self.size * w / h)
-                return img.resize((ow, oh), self.interpolation)
-        else:
-            return img.resize(self.size, self.interpolation)
-
-
-class CenterCrop(object):
-    """Crops the given PIL.Image at the center.
-    Args:
-        size (sequence or int): Desired output size of the crop. If size is an
-            int instead of sequence like (h, w), a square crop (size, size) is
-            made.
-    """
-
-    def __init__(self, size):
-        if isinstance(size, numbers.Number):
-            self.size = (int(size), int(size))
-        else:
-            self.size = size
-
-    def __call__(self, img):
-        """
-        Args:
-            img (PIL.Image): Image to be cropped.
-        Returns:
-            PIL.Image: Cropped image.
-        """
-        w, h = img.size
-        th, tw = self.size
-        x1 = int(round((w - tw) / 2.))
-        y1 = int(round((h - th) / 2.))
-        return img.crop((x1, y1, x1 + tw, y1 + th))

+ 0 - 50
temporal_transforms.py

@@ -1,50 +0,0 @@
-import random
-import math
-
-
-class LoopPadding(object):
-    def __init__(self, size):
-        self.size = size
-
-    def __call__(self, frame_indices):
-        out = frame_indices
-
-        for index in out:
-            if len(out) >= self.size:
-                break
-            out.append(index)
-
-        return out
-
-
-class TemporalCenterCrop(object):
-    """Temporally crop the given frame indices at a center.
-    If the number of frames is less than the size,
-    loop the indices as many times as necessary to satisfy the size.
-    Args:
-        size (int): Desired output size of the crop.
-    """
-
-    def __init__(self, size):
-        self.size = size
-
-    def __call__(self, frame_indices):
-        """
-        Args:
-            frame_indices (list): frame indices to be cropped.
-        Returns:
-            list: Cropped frame indices.
-        """
-
-        center_index = len(frame_indices) // 2
-        begin_index = max(0, center_index - (self.size // 2))
-        end_index = min(begin_index + self.size, len(frame_indices))
-
-        out = frame_indices[begin_index:end_index]
-
-        for index in out:
-            if len(out) >= self.size:
-                break
-            out.append(index)
-
-        return out

+ 1 - 0
train.py

@@ -1,3 +1,4 @@
+# adapted from PyTorch tutorials
 import copy
 import time
 from typing import List, Tuple, Optional