
added conv3d

Amir Ziai 5 years ago
commit cda3d270f2
3 changed files with 339 additions and 1 deletion
  1. conv3d.py (+195, −0)
  2. kissing_detector.py (+46, −1)
  3. qualitative.py (+98, −0)

+ 195 - 0
conv3d.py

@@ -0,0 +1,195 @@
+# code is from https://github.com/kenshohara/3D-ResNets-PyTorch/blob/master/model.py
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def conv3x3x3(in_planes, out_planes, stride=1):
+    # 3x3x3 convolution with padding
+    return nn.Conv3d(
+        in_planes,
+        out_planes,
+        kernel_size=3,
+        stride=stride,
+        padding=1,
+        bias=False)
+
+
+def downsample_basic_block(x, planes, stride):
+    # Type-A shortcut: average-pool to the new resolution, then zero-pad the
+    # channel dimension to the block's output width (adds no parameters).
+    out = F.avg_pool3d(x, kernel_size=1, stride=stride)
+    zero_pads = torch.zeros(
+        out.size(0), planes - out.size(1), out.size(2), out.size(3),
+        out.size(4), dtype=out.dtype, device=out.device)
+    return torch.cat([out, zero_pads], dim=1)
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm3d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3x3(planes, planes)
+        self.bn2 = nn.BatchNorm3d(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(Bottleneck, self).__init__()
+        self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm3d(planes)
+        self.conv2 = nn.Conv3d(
+            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm3d(planes)
+        self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm3d(planes * 4)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class ResNet(nn.Module):
+
+    def __init__(self,
+                 block,
+                 layers,
+                 sample_size,
+                 sample_duration,
+                 shortcut_type='B',
+                 num_classes=400):
+        super(ResNet, self).__init__()
+        self.inplanes = 64
+        self.conv1 = nn.Conv3d(
+            3,
+            64,
+            kernel_size=7,
+            stride=(1, 2, 2),
+            padding=(3, 3, 3),
+            bias=False)
+        self.bn1 = nn.BatchNorm3d(64)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type)
+        self.layer2 = self._make_layer(
+            block, 128, layers[1], shortcut_type, stride=2)
+        self.layer3 = self._make_layer(
+            block, 256, layers[2], shortcut_type, stride=2)
+        self.layer4 = self._make_layer(
+            block, 512, layers[3], shortcut_type, stride=2)
+        last_duration = int(math.ceil(sample_duration / 16))
+        last_size = int(math.ceil(sample_size / 32))
+        self.avgpool = nn.AvgPool3d(
+            (last_duration, last_size, last_size), stride=1)
+        self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv3d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out')
+            elif isinstance(m, nn.BatchNorm3d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            if shortcut_type == 'A':
+                downsample = partial(
+                    downsample_basic_block,
+                    planes=planes * block.expansion,
+                    stride=stride)
+            else:
+                downsample = nn.Sequential(
+                    nn.Conv3d(
+                        self.inplanes,
+                        planes * block.expansion,
+                        kernel_size=1,
+                        stride=stride,
+                        bias=False), nn.BatchNorm3d(planes * block.expansion))
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+
+        return x
+
+
+def resnet34(**kwargs):
+    """Constructs a ResNet-34 model.
+    """
+    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
+    return model
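
A quick smoke test for the new module (a minimal sketch, not part of the commit; the sample_size/sample_duration values and the dummy clip are illustrative). resnet34 takes clips shaped (batch, channels, frames, height, width) and, with the classifier head attached, returns one logit per class:

    import torch
    import conv3d

    model = conv3d.resnet34(sample_size=112, sample_duration=16,
                            shortcut_type='B', num_classes=400)
    clip = torch.randn(2, 3, 16, 112, 112)  # (N, C, T, H, W) dummy clip
    logits = model(clip)
    print(logits.shape)  # torch.Size([2, 400])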

+ 46 - 1
kissing_detector.py

@@ -1,8 +1,11 @@
+from typing import Optional
+
 import torch
 from torch import nn
+
 import vggish
 from conv import convnet_init
-from typing import Optional
+import conv3d
 
 
 class KissingDetector(nn.Module):
@@ -45,3 +48,45 @@ class KissingDetector(nn.Module):
             combined = a if a is not None else c
 
         return self.combined(combined)
+
+
+class KissingDetector3DConv(nn.Module):
+    def __init__(self,
+                 num_classes: int,
+                 feature_extract: bool,
+                 use_vggish: bool = True):
+        super(KissingDetector3DConv, self).__init__()
+        conv_output_size = 512
+        vggish_output_size = 0
+        conv_input_size = 0
+        vggish_model = None
+
+        conv = conv3d.resnet34(
+            num_classes=num_classes,
+            shortcut_type='B',
+            sample_size=224,
+            sample_duration=10
+        )
+        conv.fc = nn.Identity()
+
+        if use_vggish:
+            vggish_model, vggish_output_size = vggish.vggish(feature_extract)
+
+        if not conv and not vggish_model:
+            raise ValueError("Use VGGish, Conv, or both")
+
+        self.conv_input_size = conv_input_size
+        self.conv = conv
+        self.vggish = vggish_model
+        self.combined = nn.Linear(vggish_output_size + conv_output_size, num_classes)
+
+    def forward(self, audio: torch.Tensor, image: torch.Tensor):
+        a = self.vggish(audio) if self.vggish else None
+        c = self.conv(image) if self.conv else None
+
+        if a is not None and c is not None:
+            combined = torch.cat((c.view(c.size(0), -1), a.view(a.size(0), -1)), dim=1)
+        else:
+            combined = a if a is not None else c
+
+        return self.combined(combined)
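
The video-only path of the new detector can be exercised like this (a sketch under assumptions: use_vggish=False sidesteps the audio branch, whose input format is defined by the vggish module; the clip shape matches the hard-coded sample_size=224 and sample_duration=10):

    import torch
    from kissing_detector import KissingDetector3DConv

    model = KissingDetector3DConv(num_classes=2, feature_extract=False,
                                  use_vggish=False)
    clip = torch.randn(1, 3, 10, 224, 224)  # (N, C, T, H, W)
    logits = model(audio=None, image=clip)
    print(logits.shape)  # torch.Size([1, 2])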

+ 98 - 0
qualitative.py

@@ -94,3 +94,101 @@ class QualitativeAnalysis:
             y = [1 if 'kissing' in vid else 0] * len(A)
             self._show_saliency_maps(A, I, y)
             print('=' * 10)
+
+    # next few methods taken from cs231n
+    @staticmethod
+    def jitter(X, ox, oy):
+        """
+        Helper function to randomly jitter an image.
+
+        Inputs:
+        - X: PyTorch Tensor of shape (N, C, H, W)
+        - ox, oy: Integers giving number of pixels to jitter along W and H axes
+
+        Returns: A new PyTorch Tensor of shape (N, C, H, W)
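+        Equivalent to torch.roll(X, shifts=(oy, ox), dims=(2, 3)).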
+        """
+        if ox != 0:
+            left = X[:, :, :, :-ox]
+            right = X[:, :, :, -ox:]
+            X = torch.cat([right, left], dim=3)
+        if oy != 0:
+            top = X[:, :, :-oy]
+            bottom = X[:, :, -oy:]
+            X = torch.cat([bottom, top], dim=2)
+        return X
+
+    @staticmethod
+    def create_class_visualization(target_y, model, dtype, **kwargs):
+        """
+        Generate an image to maximize the score of target_y under a pretrained model.
+
+        Inputs:
+        - target_y: Integer in the range [0, 1000) giving the index of the class
+        - model: A pretrained CNN that will be used to generate the image
+        - dtype: Torch datatype to use for computations
+
+        Keyword arguments:
+        - l2_reg: Strength of L2 regularization on the image
+        - learning_rate: How big of a step to take
+        - num_iterations: How many iterations to use
+        - blur_every: How often to blur the image as an implicit regularizer
+        - max_jitter: How much to jitter the image as an implicit regularizer
+        - show_every: How often to show the intermediate result
+        """
+        model.type(dtype)
+        l2_reg = kwargs.pop('l2_reg', 1e-3)
+        learning_rate = kwargs.pop('learning_rate', 25)
+        num_iterations = kwargs.pop('num_iterations', 100)
+        blur_every = kwargs.pop('blur_every', 10)
+        max_jitter = kwargs.pop('max_jitter', 16)
+        show_every = kwargs.pop('show_every', 25)
+
+        # Randomly initialize the image as a PyTorch Tensor that requires gradient.
+        img = torch.randn(1, 3, 224, 224).type(dtype).requires_grad_()
+
+        for t in range(num_iterations):
+            # Randomly jitter the image a bit; this gives slightly nicer results
+            ox, oy = random.randint(0, max_jitter), random.randint(0, max_jitter)
+            img.data.copy_(QualitativeAnalysis.jitter(img.data, ox, oy))
+
+            # Gradient ascent on the class score: backprop the target logit to
+            # the image, fold in the L2-regularization gradient, and take a
+            # normalized step on the pixels.
+            target = model(img)[0, target_y]
+            target.backward()
+            g = img.grad.data
+            g -= 2 * l2_reg * img.data
+            img.data += learning_rate * (g / g.norm())
+            img.grad.zero_()
+
+            # Undo the random jitter
+            img.data.copy_(QualitativeAnalysis.jitter(img.data, -ox, -oy))
+
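+            # NOTE: SQUEEZENET_MEAN, SQUEEZENET_STD, blur_image, deprocess and
+            # class_names below are cs231n image-utils helpers; they are
+            # assumed to be imported at the top of qualitative.py.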
+            # As regularizer, clamp and periodically blur the image
+            for c in range(3):
+                lo = float(-SQUEEZENET_MEAN[c] / SQUEEZENET_STD[c])
+                hi = float((1.0 - SQUEEZENET_MEAN[c]) / SQUEEZENET_STD[c])
+                img.data[:, c].clamp_(min=lo, max=hi)
+            if t % blur_every == 0:
+                blur_image(img.data, sigma=0.5)
+
+            # Periodically show the image
+            if t == 0 or (t + 1) % show_every == 0 or t == num_iterations - 1:
+                plt.imshow(deprocess(img.data.clone().cpu()))
+                class_name = class_names[target_y]
+                plt.title('%s\nIteration %d / %d' % (class_name, t + 1, num_iterations))
+                plt.gcf().set_size_inches(4, 4)
+                plt.axis('off')
+                plt.show()
+
+        return deprocess(img.data.cpu())
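
Assuming the cs231n helpers noted above are importable, the visualization loop can be driven end to end like this (a hedged sketch; the SqueezeNet backbone and target class index are illustrative placeholders, not part of the commit):

    import torch
    import torchvision.models as models
    from qualitative import QualitativeAnalysis

    model = models.squeezenet1_1(pretrained=True)
    model.eval()  # disable dropout so the forward pass is deterministic
    for p in model.parameters():
        p.requires_grad = False  # ascend on the input image, not the weights

    img = QualitativeAnalysis.create_class_visualization(
        target_y=76, model=model, dtype=torch.FloatTensor,
        num_iterations=50, show_every=25)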