5 years ago · ac2fe34279
--- a/README.md
+++ b/README.md
@@ -9,6 +9,9 @@ Use Python 3.6+
 
															 python3 experiments.py
														
 
															 ```
														
 
															+## Requirements
														
 
															+This is a PyTorch project. Look at `requirements.txt` for more details. 
														
 
															+
														
 
															 this will run the experiments in `params.py` specified by the `experiments` dictionary.
														
 
															 ## Build dataset
														
@@ -40,6 +43,28 @@ builder = BuildDataset(base_path='path/to/movies',
 
															 builder.build_dataset()
														
 
															 ```
														
 
															+## Detect kissing segments in a given video
														
 
															+```python
														
 
															+from segmentor import Segmentor
														
 
															+import utils
														
 
															+
														
 
															+# download model.pkl from https://drive.google.com/file/d/1RlvvdInTXtJikGv_ZbHcKoblCypN1Z0A/view?usp=sharing
														
 
															+# or train your own
														
 
															+model = utils.unpickle('model.pkl')  # pickled PyTorch model 
														
 
															+s = Segmentor(model, min_frames=10, threshold=0.7)
														
 
															+
														
 
															+# For YouTube clip Hot Summer Nights - Kiss Scene (Maika Monroe and Timothee Chalamet)
														
 
															+# at https://www.youtube.com/watch?v=GG5HmLQ_Fx0
														
 
															+# v=XXX is the YouTube ID, pass that here 
														
 
															+s.visualize_segments_youtube('GG5HmLQ_Fx0')
														
 
															+
														
 
															+# alternatively you can provide a path to a local mp4 file
														
 
															+s.visualize_segments('path/to/file.mp4')
														
 
															+```
														
 
															+
														
 
															+See examples in [examples/detector.ipynb](examples/detector.ipynb).
														
 
															+
														
 
															 ## Heavily used the following resources:
														
 
															 - [Video Classification Using 3D ResNet](https://github.com/kenshohara/video-classification-3d-cnn-pytorch)
														
 
															-- [CS231N assignment 3](http://cs231n.github.io/assignments2019/assignment3/)
														
 
															+- [AudioSet](https://research.google.com/audioset/download.html)
														
 
															+- [CS231N Saliency maps and class viz PyTorch code](http://cs231n.github.io/assignments2019/assignment3/)
														
--- a/data.py
+++ b/data.py
@@ -1,14 +1,9 @@
 
															-import copy
														
 
															-import functools
														
 
															-import json
														
 
															-import os
														
 
															 import pickle
														
 
															 from glob import glob
														
 
															 from typing import Tuple, List
														
 
															 import torch
														
 
															 import torch.utils.data as data
														
 
															-from PIL import Image
														
 
															 class AV(data.Dataset):
														
@@ -71,133 +66,3 @@ class AudioVideo3D(AV):
 
															             ee = e.permute((1, 0, 2, 3))
														
 
															             out.append(ee)
														
 
															         return out
														
 
															-
														
 
															-
														
 
															-def pil_loader(path):
														
 
															-    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
														
 
															-    with open(path, 'rb') as f:
														
 
															-        with Image.open(f) as img:
														
 
															-            return img.convert('RGB')
														
 
															-
														
 
															-
														
 
															-def accimage_loader(path):
														
 
															-    # try:
														
 
															-    #     return accimage.Image(path)
														
 
															-    # except IOError:
														
 
															-    #     # Potentially a decoding problem, fall back to PIL.Image
														
 
															-    #     return pil_loader(path)
														
 
															-    return pil_loader(path)
														
 
															-
														
 
															-
														
 
															-def get_default_image_loader():
														
 
															-    from torchvision import get_image_backend
														
 
															-    if get_image_backend() == 'accimage':
														
 
															-        return accimage_loader
														
 
															-    else:
														
 
															-        return pil_loader
														
 
															-
														
 
															-
														
 
															-def video_loader(video_dir_path, frame_indices, image_loader):
														
 
															-    video = []
														
 
															-    for i in frame_indices:
														
 
															-        image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i))
														
 
															-        if os.path.exists(image_path):
														
 
															-            video.append(image_loader(image_path))
														
 
															-        else:
														
 
															-            return video
														
 
															-
														
 
															-    return video
														
 
															-
														
 
															-
														
 
															-def get_default_video_loader():
														
 
															-    image_loader = get_default_image_loader()
														
 
															-    return functools.partial(video_loader, image_loader=image_loader)
														
 
															-
														
 
															-
														
 
															-def load_annotation_data(data_file_path):
														
 
															-    with open(data_file_path, 'r') as data_file:
														
 
															-        return json.load(data_file)
														
 
															-
														
 
															-
														
 
															-def get_class_labels(data):
														
 
															-    class_labels_map = {}
														
 
															-    index = 0
														
 
															-    for class_label in data['labels']:
														
 
															-        class_labels_map[class_label] = index
														
 
															-        index += 1
														
 
															-    return class_labels_map
														
 
															-
														
 
															-
														
 
															-def get_video_names_and_annotations(data, subset):
														
 
															-    video_names = []
														
 
															-    annotations = []
														
 
															-
														
 
															-    for key, value in data['database'].items():
														
 
															-        this_subset = value['subset']
														
 
															-        if this_subset == subset:
														
 
															-            if subset == 'testing':
														
 
															-                video_names.append('test/{}'.format(key))
														
 
															-            else:
														
 
															-                label = value['annotations']['label']
														
 
															-                video_names.append('{}/{}'.format(label, key))
														
 
															-                annotations.append(value['annotations'])
														
 
															-
														
 
															-    return video_names, annotations
														
 
															-
														
 
															-
														
 
															-def make_dataset(video_path, sample_duration):
														
 
															-    dataset = []
														
 
															-
														
 
															-    n_frames = len(os.listdir(video_path))
														
 
															-
														
 
															-    begin_t = 1
														
 
															-    end_t = n_frames
														
 
															-    sample = {
														
 
															-        'video': video_path,
														
 
															-        'segment': [begin_t, end_t],
														
 
															-        'n_frames': n_frames,
														
 
															-    }
														
 
															-
														
 
															-    step = sample_duration
														
 
															-    for i in range(1, (n_frames - sample_duration + 1), step):
														
 
															-        sample_i = copy.deepcopy(sample)
														
 
															-        sample_i['frame_indices'] = list(range(i, i + sample_duration))
														
 
															-        sample_i['segment'] = torch.IntTensor([i, i + sample_duration - 1])
														
 
															-        dataset.append(sample_i)
														
 
															-
														
 
															-    return dataset
														
 
															-
														
 
															-
														
 
															-class Video(data.Dataset):
														
 
															-    def __init__(self, video_path,
														
 
															-                 spatial_transform=None, temporal_transform=None,
														
 
															-                 sample_duration=16, get_loader=get_default_video_loader):
														
 
															-        self.data = make_dataset(video_path, sample_duration)
														
 
															-
														
 
															-        self.spatial_transform = spatial_transform
														
 
															-        self.temporal_transform = temporal_transform
														
 
															-        self.loader = get_loader()
														
 
															-
														
 
															-    def __getitem__(self, index):
														
 
															-        """
														
 
															-        Args:
														
 
															-            index (int): Index
														
 
															-        Returns:
														
 
															-            tuple: (image, target) where target is class_index of the target class.
														
 
															-        """
														
 
															-        path = self.data[index]['video']
														
 
															-
														
 
															-        frame_indices = self.data[index]['frame_indices']
														
 
															-        if self.temporal_transform is not None:
														
 
															-            frame_indices = self.temporal_transform(frame_indices)
														
 
															-        clip = self.loader(path, frame_indices)
														
 
															-        if self.spatial_transform is not None:
														
 
															-            clip = [self.spatial_transform(img) for img in clip]
														
 
															-        clip = torch.stack(clip, 0).permute(1, 0, 2, 3)
														
 
															-
														
 
															-        target = self.data[index]['segment']
														
 
															-
														
 
															-        return clip, target
														
 
															-
														
 
															-    def __len__(self):
														
 
															-        return len(self.data)
														
--- a/examples/detector.ipynb
+++ b/examples/detector.ipynb
--- a/kissing_detector.py
+++ b/kissing_detector.py
@@ -65,7 +65,7 @@ class KissingDetector3DConv(nn.Module):
 
															             num_classes=num_classes,
														
 
															             shortcut_type='B',
														
 
															             sample_size=224,
														
 
															-            sample_duration=10
														
 
															+            sample_duration=16
														
 
															         )
														
 
															         set_parameter_requires_grad(conv, feature_extract)
														
 
															         conv.fc = nn.Identity()
														
--- a/params.py
+++ b/params.py
@@ -7,6 +7,7 @@ data_path_base = 'vtest_new2'
 
															 mean = np.array([0.485, 0.456, 0.406])
														
 
															 std = np.array([0.229, 0.224, 0.225])
														
 
															+vggish_frame_rate = 0.96
														
 
															 # test end-to-end
														
 
															 experiment_test = {
														
--- a/pipeline.py
+++ b/pipeline.py
@@ -9,39 +9,12 @@ import numpy as np
 
															 import torch
														
 
															 from PIL import Image
														
 
															 from moviepy.editor import VideoFileClip
														
 
															-from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
														
 
															+
														
 
															 from torchvision import transforms
														
 
															 import params
														
 
															 import vggish_input
														
 
															-VGGISH_FRAME_RATE = 0.96
														
 
															-
														
 
															-
														
 
															-def slice_clips(segments, root, fps=2):
														
 
															-    for path, classes in segments.items():
														
 
															-
														
 
															-        for cls, ts in classes.items():
														
 
															-            for i, (t1, t2) in enumerate(ts):
														
 
															-                set_ = np.random.choice(['train', 'val'], p=[2 / 3, 1 / 3])
														
 
															-                # get all the still frames
														
 
															-                file_name, ext = path.split('.')
														
 
															-                target = f"{root}{file_name}_{cls}_{i + 1}.{ext}"
														
 
															-                print(f'target: {target}')
														
 
															-                ffmpeg_extract_subclip(f'{root}{path}', t1, t2, targetname=target)
														
 
															-                vidcap = cv2.VideoCapture(target)
														
 
															-                vidcap.set(cv2.CAP_PROP_FPS, fps)
														
 
															-                print(cv2.CAP_PROP_FPS)
														
 
															-                success, image = vidcap.read()
														
 
															-                count = 0
														
 
															-                while success:
														
 
															-                    frame_path = f'{root}casino/{set_}/{cls}/{file_name}_{i}_{count + 1}.jpg'
														
 
															-                    # print(frame_path)
														
 
															-                    cv2.imwrite(frame_path, image)  # save frame as JPEG file
														
 
															-                    success, image = vidcap.read()
														
 
															-                    # print('Read a new frame: ', success)
														
 
															-                    count += 1
														
 
															-
														
 
															 class BuildDataset:
														
 
															     def __init__(self,
														
@@ -117,7 +90,7 @@ class BuildDataset:
 
															                 print('Something went wrong!')
														
 
															                 break
														
 
															-            if frame_id % math.floor(frame_rate * VGGISH_FRAME_RATE) == 0:
														
 
															+            if frame_id % math.floor(frame_rate * params.vggish_frame_rate) == 0:
														
 
															                 frame_pil = Image.fromarray(frame, mode='RGB')
														
 
															                 images.append(transformer(frame_pil))
														
 
															                 # images += [transformer(frame_pil) for _ in range(self.n_augment)]