@@ -0,0 +1,175 @@
+import random
+import math
+import numbers
+import collections.abc
+import numpy as np
+import torch
+from PIL import Image, ImageOps
+try:
+    import accimage
+except ImportError:
+    accimage = None
+
+
+class Compose(object):
+    """Composes several transforms together.
+
+    Args:
+        transforms (list of ``Transform`` objects): list of transforms to compose.
+
+    Example:
+        >>> transforms.Compose([
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.ToTensor(),
+        >>> ])
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, img):
+        for t in self.transforms:
+            img = t(img)
+        return img
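+
+# A minimal usage sketch (illustrative, not part of the module): ``Compose``
+# simply chains callables, so any function taking and returning an image fits
+# the pipeline. The image path below is hypothetical.
+#
+#     >>> from PIL import Image
+#     >>> transform = Compose([Scale(256), CenterCrop(224), ToTensor()])
+#     >>> tensor = transform(Image.open('example.jpg'))  # (3, 224, 224) FloatTensor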
+
+
+class ToTensor(object):
+    """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor.
+
+    Converts a PIL.Image or numpy.ndarray (H x W x C) in the range
+    [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
+    """
+
+    def __call__(self, pic):
+        """
+        Args:
+            pic (PIL.Image or numpy.ndarray): Image to be converted to tensor.
+
+        Returns:
+            Tensor: Converted image.
+        """
+        if isinstance(pic, np.ndarray):
+            # handle numpy array
+            img = torch.from_numpy(pic.transpose((2, 0, 1)))
+            # backward compatibility: scale from [0, 255] into [0.0, 1.0]
+            return img.float().div(255)
+
+        if accimage is not None and isinstance(pic, accimage.Image):
+            nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32)
+            pic.copyto(nppic)
+            return torch.from_numpy(nppic)
+
+        # handle PIL Image
+        if pic.mode == 'I':
+            img = torch.from_numpy(np.array(pic, np.int32, copy=False))
+        elif pic.mode == 'I;16':
+            img = torch.from_numpy(np.array(pic, np.int16, copy=False))
+        else:
+            img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
+        # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
+        if pic.mode == 'YCbCr':
+            nchannel = 3
+        elif pic.mode == 'I;16':
+            nchannel = 1
+        else:
+            nchannel = len(pic.mode)
+        img = img.view(pic.size[1], pic.size[0], nchannel)
+        # put it from HWC to CHW format
+        # yikes, this transpose takes 80% of the loading time/CPU
+        img = img.transpose(0, 1).transpose(0, 2).contiguous()
+        if isinstance(img, torch.ByteTensor):
+            return img.float().div(255)
+        else:
+            return img
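+
+# A minimal usage sketch (illustrative, not part of the module; the random
+# uint8 array stands in for any H x W x C image):
+#
+#     >>> import numpy as np
+#     >>> arr = np.random.randint(0, 256, (32, 32, 3), dtype=np.uint8)
+#     >>> t = ToTensor()(arr)                       # FloatTensor of shape (3, 32, 32)
+#     >>> 0.0 <= float(t.min()) <= float(t.max()) <= 1.0
+#     True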
+
+
+class Normalize(object):
+    """Normalize a tensor image with mean and standard deviation.
+
+    Given mean: (R, G, B) and std: (R, G, B),
+    will normalize each channel of the torch.*Tensor, i.e.
+    channel = (channel - mean) / std
+
+    Args:
+        mean (sequence): Sequence of means for R, G, B channels respectively.
+        std (sequence): Sequence of standard deviations for R, G, B channels
+            respectively.
+    """
+
+    def __init__(self, mean, std):
+        self.mean = mean
+        self.std = std
+
+    def __call__(self, tensor):
+        """
+        Args:
+            tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
+
+        Returns:
+            Tensor: Normalized image.
+        """
+        # TODO: make efficient
+        for t, m, s in zip(tensor, self.mean, self.std):
+            t.sub_(m).div_(s)
+        return tensor
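+
+# A minimal usage sketch (illustrative, not part of the module; the mean/std
+# values are arbitrary). Note that ``Normalize`` mutates its input in place:
+#
+#     >>> t = torch.ones(3, 4, 4)
+#     >>> out = Normalize(mean=[0.5, 0.5, 0.5], std=[0.25, 0.25, 0.25])(t)
+#     >>> float(out[0, 0, 0])                       # (1.0 - 0.5) / 0.25
+#     2.0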
+
+
+class Scale(object):
+    """Rescale the input PIL.Image to the given size.
+
+    Args:
+        size (sequence or int): Desired output size. If size is a sequence like
+            (w, h), output size will be matched to this. If size is an int,
+            the smaller edge of the image will be matched to this number.
+            i.e., if height > width, then the image will be rescaled to
+            (size, size * height / width)
+        interpolation (int, optional): Desired interpolation. Default is
+            ``PIL.Image.BILINEAR``
+    """
+
+    def __init__(self, size, interpolation=Image.BILINEAR):
+        assert isinstance(size, int) or (isinstance(size, collections.abc.Iterable) and len(size) == 2)
+        self.size = size
+        self.interpolation = interpolation
+
+    def __call__(self, img):
+        """
+        Args:
+            img (PIL.Image): Image to be scaled.
+
+        Returns:
+            PIL.Image: Rescaled image.
+        """
+        if isinstance(self.size, int):
+            w, h = img.size
+            if (w <= h and w == self.size) or (h <= w and h == self.size):
+                return img
+            if w < h:
+                ow = self.size
+                oh = int(self.size * h / w)
+                return img.resize((ow, oh), self.interpolation)
+            else:
+                oh = self.size
+                ow = int(self.size * w / h)
+                return img.resize((ow, oh), self.interpolation)
+        else:
+            return img.resize(self.size, self.interpolation)
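+
+# A minimal usage sketch (illustrative, not part of the module): with an int
+# size, only the smaller edge is matched, preserving the aspect ratio.
+#
+#     >>> from PIL import Image
+#     >>> img = Image.new('RGB', (400, 200))        # PIL sizes are (w, h)
+#     >>> Scale(100)(img).size                      # smaller edge (h=200) -> 100
+#     (200, 100)
+#     >>> Scale((50, 60))(img).size                 # explicit (w, h)
+#     (50, 60)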
+
+
+class CenterCrop(object):
+    """Crops the given PIL.Image at the center.
+
+    Args:
+        size (sequence or int): Desired output size of the crop. If size is an
+            int instead of sequence like (h, w), a square crop (size, size) is
+            made.
+    """
+
+    def __init__(self, size):
+        if isinstance(size, numbers.Number):
+            self.size = (int(size), int(size))
+        else:
+            self.size = size
+
+    def __call__(self, img):
+        """
+        Args:
+            img (PIL.Image): Image to be cropped.
+
+        Returns:
+            PIL.Image: Cropped image.
+        """
+        w, h = img.size
+        th, tw = self.size
+        x1 = int(round((w - tw) / 2.))
+        y1 = int(round((h - th) / 2.))
+        return img.crop((x1, y1, x1 + tw, y1 + th))
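+
+# A minimal usage sketch (illustrative, not part of the module): the crop box
+# is centered, so a (h, w) = (100, 100) crop of a 400 x 200 image starts at
+# x1 = (400 - 100) / 2 = 150 and y1 = (200 - 100) / 2 = 50.
+#
+#     >>> from PIL import Image
+#     >>> img = Image.new('RGB', (400, 200))
+#     >>> CenterCrop(100)(img).size
+#     (100, 100)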