Amir Ziai 4 years ago
commit 98455f0e8f
14 changed files with 43 additions and 275 deletions
  1. README.md (+12 −24)
  2. conv.py (+1 −0)
  3. dev/dev4.ipynb (+0 −0)
  4. dev/dev5.ipynb (+0 −0)
  5. dev/dev6.ipynb (+0 −0)
  6. dev/dev7.ipynb (+0 −0)
  7. dev/dev8.ipynb (+0 −0)
  8. experiments.py (+1 −1)
  9. params.py (+1 −1)
  10. qualitative.py (+26 −24)
  11. requirements.txt (+1 −0)
  12. spatial_transforms.py (+0 −175)
  13. temporal_transforms.py (+0 −50)
  14. train.py (+1 −0)

+ 12 - 24
README.md

@@ -3,7 +3,16 @@ Detect kissing scenes in a movie using both audio and video features.
 
 Project for [Stanford CS231N](http://cs231n.stanford.edu)
 
+## Running the code
+Use Python 3.6+
+```bash
+python3 experiments.py
+```
+
+This runs the experiments specified by the `experiments` dictionary in `params.py`.
+
 ## Build dataset
+The following builds the dataset for training. You need to provide a path to the video segments.
 ```python
 from pipeline import BuildDataset
 
@@ -31,27 +40,6 @@ builder = BuildDataset(base_path='path/to/movies',
 builder.build_dataset()
 ```
 
-### Data loader
-
-
-## Explorations:
-- ConvNet, VGGish, or both
-- ConvNet architectures: ResNet, VGG, AlexNet, SqueezeNet, DenseNet
-- With and without pre-training
-- 
-- (3DC) 
-
-## Diagnostics
-- Saliency maps
-- Class viz
-- Confusion matrices
-- Detected segments
-- Failure examples
-
-## TODO
-- Qual
-    - Saliency map
-    - class viz
-    - Error examples
-    - Audio?
-- 3DC
+## Resources heavily used
+- [Video Classification Using 3D ResNet](https://github.com/kenshohara/video-classification-3d-cnn-pytorch)
+- [CS231N assignment 3](http://cs231n.github.io/assignments2019/assignment3/)

+ 1 - 0
conv.py

@@ -1,3 +1,4 @@
+# adapted from PyTorch tutorials
 import torch
 from torch import nn
 from torchvision import models

+ 0 - 0
dev4.ipynb → dev/dev4.ipynb


+ 0 - 0
dev5.ipynb → dev/dev5.ipynb


+ 0 - 0
dev6.ipynb → dev/dev6.ipynb


+ 0 - 0
dev7.ipynb → dev/dev7.ipynb


+ 0 - 0
dev8.ipynb → dev/dev8.ipynb


+ 1 - 1
experiments.py

@@ -70,5 +70,5 @@ class ExperimentRunner:
 
 
 if __name__ == '__main__':
-    experiment1 = ExperimentRunner(params.experiment1_test, n_jobs=params.n_jobs)
+    experiment1 = ExperimentRunner(params.experiments, n_jobs=params.n_jobs)
     experiment1.run()
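Given the `n_jobs` argument here and the `joblib` entry already in `requirements.txt`, the runner presumably dispatches runs in parallel with joblib. A hedged sketch of that pattern (`run_one` is a hypothetical stand-in for whatever `ExperimentRunner` executes per configuration):

```python
# Hypothetical sketch of joblib-based dispatch; ExperimentRunner's actual
# internals are not shown in this diff.
from joblib import Parallel, delayed

def run_one(config):
    """Train and evaluate a single configuration (stub)."""
    return config

def run_all(configs, n_jobs):
    return Parallel(n_jobs=n_jobs)(delayed(run_one)(c) for c in configs)
```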

+ 1 - 1
params.py

@@ -34,7 +34,7 @@ experiment_test_3d = {
 
 experiments = {
     'data_path_base': {data_path_base},
-    'conv_model_name': {'resnet', None},  # vgg
+    'conv_model_name': {'resnet', None, 'vgg', 'densenet', 'squeezenet'},
     'num_epochs': {10},
     'feature_extract': {True, False},
     'batch_size': {64},
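Each value in the `experiments` dictionary is a set of candidate settings, so the dictionary as a whole describes a hyperparameter grid. A minimal sketch of how such a grid could be expanded into per-run configurations (the `expand_grid` helper below is hypothetical, not the repo's actual `ExperimentRunner` logic):

```python
# Hypothetical sketch: expand a dict of value-sets into one config per run.
from itertools import product

def expand_grid(grid):
    keys = sorted(grid)  # fix an order, since set iteration order is arbitrary
    for values in product(*(grid[k] for k in keys)):
        yield dict(zip(keys, values))

# e.g. {'conv_model_name': {'resnet', 'vgg'}, 'feature_extract': {True, False}}
# expands to 4 run configurations.
```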

+ 26 - 24
qualitative.py

@@ -1,11 +1,15 @@
+# adapted from http://cs231n.github.io/assignments2019/assignment3/
+import random
 from typing import Dict, List
 
 import matplotlib.pyplot as plt
+import numpy as np
 import torch
 from PIL import Image
+from scipy.ndimage.filters import gaussian_filter1d
 from torch import nn
-import numpy as np
 
+import params
 from pipeline import BuildDataset
 
 
@@ -117,7 +121,15 @@ class QualitativeAnalysis:
             X = torch.cat([bottom, top], dim=2)
         return X
 
-    def create_class_visualization(target_y, model, dtype, **kwargs):
+    @staticmethod
+    def _blur_image(X, sigma=1):
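+        # Gaussian-blur the H and W axes of an (N, C, H, W) batch in place, as a smoothness regularizer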
+        X_np = X.cpu().clone().numpy()
+        X_np = gaussian_filter1d(X_np, sigma, axis=2)
+        X_np = gaussian_filter1d(X_np, sigma, axis=3)
+        X.copy_(torch.Tensor(X_np).type_as(X))
+        return X
+
+    def create_class_visualization(self, target_y, model, dtype, a, **kwargs):
         """
         Generate an image to maximize the score of target_y under a pretrained model.
 
@@ -134,6 +146,10 @@ class QualitativeAnalysis:
         - max_jitter: How much to jitter the image as an implicit regularizer
         - show_every: How often to show the intermediate result
         """
+
+        def deprocess(x):
+            return BuildDataset.transform_reverse(x.squeeze(0))
+
         model.type(dtype)
         l2_reg = kwargs.pop('l2_reg', 1e-3)
         learning_rate = kwargs.pop('learning_rate', 25)
@@ -148,44 +164,30 @@ class QualitativeAnalysis:
         for t in range(num_iterations):
             # Randomly jitter the image a bit; this gives slightly nicer results
             ox, oy = random.randint(0, max_jitter), random.randint(0, max_jitter)
-            img.data.copy_(jitter(img.data, ox, oy))
-
-            ########################################################################
-            # TODO: Use the model to compute the gradient of the score for the     #
-            # class target_y with respect to the pixels of the image, and make a   #
-            # gradient step on the image using the learning rate. Don't forget the #
-            # L2 regularization term!                                              #
-            # Be very careful about the signs of elements in your code.            #
-            ########################################################################
-            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
-
-            target = model(img)[0, target_y]
+            img.data.copy_(self.jitter(img.data, ox, oy))
+
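+            # Gradient ascent on the target class score; the L2 term penalizes large pixel values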
+            target = model(a, img)[0, target_y]
             target.backward()
             g = img.grad.data
             g -= 2 * l2_reg * img.data
             img.data += learning_rate * (g / g.norm())
             img.grad.zero_()
 
-            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
-            ########################################################################
-            #                             END OF YOUR CODE                         #
-            ########################################################################
-
             # Undo the random jitter
-            img.data.copy_(jitter(img.data, -ox, -oy))
+            img.data.copy_(self.jitter(img.data, -ox, -oy))
 
             # As regularizer, clamp and periodically blur the image
             for c in range(3):
-                lo = float(-SQUEEZENET_MEAN[c] / SQUEEZENET_STD[c])
-                hi = float((1.0 - SQUEEZENET_MEAN[c]) / SQUEEZENET_STD[c])
+                lo = float(-params.mean[c] / params.std[c])
+                hi = float((1.0 - params.mean[c]) / params.std[c])
                 img.data[:, c].clamp_(min=lo, max=hi)
             if t % blur_every == 0:
-                blur_image(img.data, sigma=0.5)
+                self._blur_image(img.data, sigma=0.5)
 
             # Periodically show the image
             if t == 0 or (t + 1) % show_every == 0 or t == num_iterations - 1:
                 plt.imshow(deprocess(img.data.clone().cpu()))
-                class_name = class_names[target_y]
+                class_name = self.class_names[target_y]
                 plt.title('%s\nIteration %d / %d' % (class_name, t + 1, num_iterations))
                 plt.gcf().set_size_inches(4, 4)
                 plt.axis('off')
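The loop above is standard class visualization by gradient ascent: jitter the image, step along the gradient of the L2-regularized class score, undo the jitter, then clamp and periodically blur as regularizers. The core update can be reproduced in isolation; a minimal, self-contained sketch with a toy model (everything below is illustrative, and unlike this repo's model it takes no audio input `a`):

```python
# Toy demonstration of the gradient-ascent update used above (hypothetical model).
import torch
from torch import nn

model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 8 * 8, 2))  # stand-in 2-class scorer
img = torch.randn(1, 3, 8, 8, requires_grad=True)
l2_reg, learning_rate, target_y = 1e-3, 25.0, 1

for _ in range(10):
    score = model(img)[0, target_y]            # class score to maximize
    score.backward()
    g = img.grad.data - 2 * l2_reg * img.data  # ascend score - l2_reg * ||img||^2
    img.data += learning_rate * g / g.norm()   # normalized gradient step
    img.grad.zero_()
```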

+ 1 - 0
requirements.txt

@@ -10,3 +10,4 @@ joblib
 pandas
 matplotlib
 pytube
+scipy

+ 0 - 175
spatial_transforms.py

@@ -1,175 +0,0 @@
-import random
-import math
-import numbers
-import collections
-import numpy as np
-import torch
-from PIL import Image, ImageOps
-try:
-    import accimage
-except ImportError:
-    accimage = None
-
-
-class Compose(object):
-    """Composes several transforms together.
-    Args:
-        transforms (list of ``Transform`` objects): list of transforms to compose.
-    Example:
-        >>> transforms.Compose([
-        >>>     transforms.CenterCrop(10),
-        >>>     transforms.ToTensor(),
-        >>> ])
-    """
-
-    def __init__(self, transforms):
-        self.transforms = transforms
-
-    def __call__(self, img):
-        for t in self.transforms:
-            img = t(img)
-        return img
-
-
-class ToTensor(object):
-    """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor.
-    Converts a PIL.Image or numpy.ndarray (H x W x C) in the range
-    [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
-    """
-
-    def __call__(self, pic):
-        """
-        Args:
-            pic (PIL.Image or numpy.ndarray): Image to be converted to tensor.
-        Returns:
-            Tensor: Converted image.
-        """
-        if isinstance(pic, np.ndarray):
-            # handle numpy array
-            img = torch.from_numpy(pic.transpose((2, 0, 1)))
-            # backward compatibility
-            return img.float()
-
-        if accimage is not None and isinstance(pic, accimage.Image):
-            nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32)
-            pic.copyto(nppic)
-            return torch.from_numpy(nppic)
-
-        # handle PIL Image
-        if pic.mode == 'I':
-            img = torch.from_numpy(np.array(pic, np.int32, copy=False))
-        elif pic.mode == 'I;16':
-            img = torch.from_numpy(np.array(pic, np.int16, copy=False))
-        else:
-            img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
-        # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
-        if pic.mode == 'YCbCr':
-            nchannel = 3
-        elif pic.mode == 'I;16':
-            nchannel = 1
-        else:
-            nchannel = len(pic.mode)
-        img = img.view(pic.size[1], pic.size[0], nchannel)
-        # put it from HWC to CHW format
-        # yikes, this transpose takes 80% of the loading time/CPU
-        img = img.transpose(0, 1).transpose(0, 2).contiguous()
-        if isinstance(img, torch.ByteTensor):
-            return img.float()
-        else:
-            return img
-
-
-class Normalize(object):
-    """Normalize an tensor image with mean and standard deviation.
-    Given mean: (R, G, B) and std: (R, G, B),
-    will normalize each channel of the torch.*Tensor, i.e.
-    channel = (channel - mean) / std
-    Args:
-        mean (sequence): Sequence of means for R, G, B channels respecitvely.
-        std (sequence): Sequence of standard deviations for R, G, B channels
-            respecitvely.
-    """
-
-    def __init__(self, mean, std):
-        self.mean = mean
-        self.std = std
-
-    def __call__(self, tensor):
-        """
-        Args:
-            tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
-        Returns:
-            Tensor: Normalized image.
-        """
-        # TODO: make efficient
-        for t, m, s in zip(tensor, self.mean, self.std):
-            t.sub_(m).div_(s)
-        return tensor
-
-
-class Scale(object):
-    """Rescale the input PIL.Image to the given size.
-    Args:
-        size (sequence or int): Desired output size. If size is a sequence like
-            (w, h), output size will be matched to this. If size is an int,
-            smaller edge of the image will be matched to this number.
-            i.e, if height > width, then image will be rescaled to
-            (size * height / width, size)
-        interpolation (int, optional): Desired interpolation. Default is
-            ``PIL.Image.BILINEAR``
-    """
-
-    def __init__(self, size, interpolation=Image.BILINEAR):
-        assert isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2)
-        self.size = size
-        self.interpolation = interpolation
-
-    def __call__(self, img):
-        """
-        Args:
-            img (PIL.Image): Image to be scaled.
-        Returns:
-            PIL.Image: Rescaled image.
-        """
-        if isinstance(self.size, int):
-            w, h = img.size
-            if (w <= h and w == self.size) or (h <= w and h == self.size):
-                return img
-            if w < h:
-                ow = self.size
-                oh = int(self.size * h / w)
-                return img.resize((ow, oh), self.interpolation)
-            else:
-                oh = self.size
-                ow = int(self.size * w / h)
-                return img.resize((ow, oh), self.interpolation)
-        else:
-            return img.resize(self.size, self.interpolation)
-
-
-class CenterCrop(object):
-    """Crops the given PIL.Image at the center.
-    Args:
-        size (sequence or int): Desired output size of the crop. If size is an
-            int instead of sequence like (h, w), a square crop (size, size) is
-            made.
-    """
-
-    def __init__(self, size):
-        if isinstance(size, numbers.Number):
-            self.size = (int(size), int(size))
-        else:
-            self.size = size
-
-    def __call__(self, img):
-        """
-        Args:
-            img (PIL.Image): Image to be cropped.
-        Returns:
-            PIL.Image: Cropped image.
-        """
-        w, h = img.size
-        th, tw = self.size
-        x1 = int(round((w - tw) / 2.))
-        y1 = int(round((h - th) / 2.))
-        return img.crop((x1, y1, x1 + tw, y1 + th))

+ 0 - 50
temporal_transforms.py

@@ -1,50 +0,0 @@
-import random
-import math
-
-
-class LoopPadding(object):
-    def __init__(self, size):
-        self.size = size
-
-    def __call__(self, frame_indices):
-        out = frame_indices
-
-        for index in out:
-            if len(out) >= self.size:
-                break
-            out.append(index)
-
-        return out
-
-
-class TemporalCenterCrop(object):
-    """Temporally crop the given frame indices at a center.
-    If the number of frames is less than the size,
-    loop the indices as many times as necessary to satisfy the size.
-    Args:
-        size (int): Desired output size of the crop.
-    """
-
-    def __init__(self, size):
-        self.size = size
-
-    def __call__(self, frame_indices):
-        """
-        Args:
-            frame_indices (list): frame indices to be cropped.
-        Returns:
-            list: Cropped frame indices.
-        """
-
-        center_index = len(frame_indices) // 2
-        begin_index = max(0, center_index - (self.size // 2))
-        end_index = min(begin_index + self.size, len(frame_indices))
-
-        out = frame_indices[begin_index:end_index]
-
-        for index in out:
-            if len(out) >= self.size:
-                break
-            out.append(index)
-
-        return out

+ 1 - 0
train.py

@@ -1,3 +1,4 @@
+# adapted from PyTorch tutorials
 import copy
 import time
 from typing import List, Tuple, Optional