From 08b32428f39831a2fdfdfa1638e937f4f2568bd3 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:09:24 -0400 Subject: [PATCH 01/20] Type annotate miniscene2behavior --- src/kabr_tools/miniscene2behavior.py | 76 +++++++++++++++------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 022ffce..6b000a9 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -1,19 +1,21 @@ import sys +import argparse import torch from lxml import etree import pandas as pd import cv2 -import argparse from tqdm import tqdm import slowfast.utils.checkpoint as cu -import slowfast.models.build as build -import slowfast.utils.parser as parser +from slowfast.models import build +from slowfast.utils import parser from slowfast.datasets.utils import get_sequence from slowfast.visualization.utils import process_cv2_inputs from slowfast.datasets.cv2_transform import scale +from fvcore.common.config import CfgNode +from torch import Tensor -def get_input_clip(cap, cfg, keyframe_idx): +def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> list[Tensor]: # https://github.com/facebookresearch/SlowFast/blob/bac7b672f40d44166a84e8c51d1a5ba367ace816/slowfast/visualization/ava_demo_precomputed_boxes.py seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) @@ -32,7 +34,7 @@ def get_input_clip(cap, cfg, keyframe_idx): frame = scale(cfg.DATA.TEST_CROP_SIZE, frame) clip.append(frame) else: - print('Unable to read frame. Duplicating previous frame.') + print("Unable to read frame. Duplicating previous frame.") clip.append(clip[-1]) clip = process_cv2_inputs(clip, cfg) @@ -42,57 +44,57 @@ def get_input_clip(cap, cfg, keyframe_idx): def parse_args(): local_parser = argparse.ArgumentParser() local_parser.add_argument( - '--config', + "--config", type=str, - help='model config.yml filepath', - default='config.yml' + help="model config.yml filepath", + default="config.yml" ) local_parser.add_argument( - '--checkpoint', + "--checkpoint", type=str, - help='model checkpoint.pyth filepath', + help="model checkpoint.pyth filepath", required=True ) local_parser.add_argument( - '--gpu_num', + "--gpu_num", type=int, - help='number of gpus', + help="number of gpus", default=0 ) local_parser.add_argument( - '--miniscene', + "--miniscene", type=str, - help='miniscene folder containing miniscene\'s tracks.xml & *.mp4', + help="miniscene folder containing miniscene\'s tracks.xml & *.mp4", required=True ) local_parser.add_argument( - '--video', + "--video", type=str, - help='name of video (expect video_tracks.xml from tracks_extractor)', + help="name of video (expect video_tracks.xml from tracks_extractor)", required=True ) local_parser.add_argument( - '--output', + "--output", type=str, - help='filepath for output csv', - default='annotation_data.csv' + help="filepath for output csv", + default="annotation_data.csv" ) return local_parser.parse_args() -def create_model(config_path, checkpoint_path, gpu_num): +def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[CfgNode, torch.nn.Module]: # load model config try: cfg = parser.load_config(parser.parse_args(), config_path) except FileNotFoundError: checkpoint = torch.load( - checkpoint_path, map_location=torch.device('cpu')) - with open(config_path, 'w') as file: - file.write(checkpoint['cfg']) + 
checkpoint_path, map_location=torch.device("cpu")) + with open(config_path, "w") as file: + file.write(checkpoint["cfg"]) cfg = parser.load_config(parser.parse_args(), config_path) cfg.NUM_GPUS = gpu_num - cfg.OUTPUT_DIR = '' + cfg.OUTPUT_DIR = "" model = build.build_model(cfg) # load model checkpoint @@ -103,9 +105,9 @@ def create_model(config_path, checkpoint_path, gpu_num): return cfg, model -def annotate_miniscene(cfg, model, miniscene_path, video, output_path): +def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, miniscene_path: str, video: str, output_path: str) -> None: label_data = [] - track_file = f'{miniscene_path}/metadata/{video}_tracks.xml' + track_file = f"{miniscene_path}/metadata/{video}_tracks.xml" root = etree.parse(track_file).getroot() # find all tracks @@ -115,15 +117,17 @@ def annotate_miniscene(cfg, model, miniscene_path, video, output_path): tracks.append(track_id) # find all frames + assert len(tracks) > 0, "No tracks found in track file" + track = tracks[-1] frames = [] for box in track.iterfind("box"): - frames.append(int(box.attrib['frame'])) + frames.append(int(box.attrib["frame"])) # run model on miniscene for track in tracks: video_file = f"{miniscene_path}/{track}.mp4" cap = cv2.VideoCapture(video_file) - for frame in tqdm(frames, desc=f'{track} frames'): + for frame in tqdm(frames, desc=f"{track} frames"): inputs = get_input_clip(cap, cfg, frame) if cfg.NUM_GPUS: @@ -140,17 +144,17 @@ def annotate_miniscene(cfg, model, miniscene_path, video, output_path): if cfg.NUM_GPUS: preds = preds.cpu() - label_data.append({'video': video, - 'track': track, - 'frame': frame, - 'label': torch.argmax(preds).item()}) + label_data.append({"video": video, + "track": track, + "frame": frame, + "label": torch.argmax(preds).item()}) if frame % 20 == 0: pd.DataFrame(label_data).to_csv( - output_path, sep=' ', index=False) - pd.DataFrame(label_data).to_csv(output_path, sep=' ', index=False) + output_path, sep=" ", index=False) + pd.DataFrame(label_data).to_csv(output_path, sep=" ", index=False) -def main(): +def main() -> None: # clear arguments to avoid slowfast parsing issues args = parse_args() sys.argv = [sys.argv[0]] @@ -159,5 +163,5 @@ def main(): args.video, args.output) -if __name__ == '__main__': +if __name__ == "__main__": main() From d747a3ee1b77d5563120e402d636d65ba188f610 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:10:29 -0400 Subject: [PATCH 02/20] Type annotate parse_args --- src/kabr_tools/miniscene2behavior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 6b000a9..5a1a87e 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -41,7 +41,7 @@ def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> li return clip -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( "--config", From dc9d6ae396ec168baee716cce6c11eb125ccfb84 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:19:49 -0400 Subject: [PATCH 03/20] Annotate tracks extractor --- src/kabr_tools/tracks_extractor.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/kabr_tools/tracks_extractor.py b/src/kabr_tools/tracks_extractor.py index 4beff99..6fbab25 100644 --- 
a/src/kabr_tools/tracks_extractor.py +++ b/src/kabr_tools/tracks_extractor.py @@ -15,7 +15,7 @@ from tqdm import tqdm -def generate_timeline_image(name, folder, timeline, annotated_size): +def generate_timeline_image(name: str, folder: str, timeline: OrderedDict, annotated_size: int) -> None: timeline_image = np.zeros(shape=(len(timeline["tracks"].keys()) * 100, annotated_size, 3), dtype=np.uint8) for i, (key, value) in enumerate(timeline["tracks"].items()): @@ -47,7 +47,7 @@ def generate_timeline_image(name, folder, timeline, annotated_size): cv2.imwrite(f"mini-scenes/{folder}/metadata/{name}.jpg", timeline_resized) -def extract(video_path, annotation_path, tracking, show): +def extract(video_path: str, annotation_path: str, tracking: bool, show: bool) -> None: # Parse CVAT for video 1.1 annotation file. root = etree.parse(annotation_path).getroot() annotated = dict() @@ -180,7 +180,7 @@ def extract(video_path, annotation_path, tracking, show): vw.release() cv2.destroyAllWindows() -def tracks_extractor(video, annotation, tracking, show): +def tracks_extractor(video: str, annotation: str, tracking: bool, show: bool) -> None: if os.path.isdir(annotation): videos = [] annotations = [] @@ -208,34 +208,34 @@ def tracks_extractor(video, annotation, tracking, show): extract(video, annotation, tracking, show) -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( - '--video', + "--video", type=str, - help='path to folder containing videos', + help="path to folder containing videos", required=True ) local_parser.add_argument( - '--annotation', + "--annotation", type=str, - help='path to folder containing annotations', + help="path to folder containing annotations", required=True ) local_parser.add_argument( - '--tracking', - action='store_true', - help='Flag to use external tracker instead of CVAT tracks' + "--tracking", + action="store_true", + help="Flag to use external tracker instead of CVAT tracks" ) local_parser.add_argument( - '--imshow', - action='store_true', - help='Flag to display tracks\' visualization' + "--imshow", + action="store_true", + help="Flag to display tracks\' visualization" ) return local_parser.parse_args() -def main(): +def main() -> None: args = parse_args() tracks_extractor(args.video, args.annotation, args.tracking, args.imshow) From 952cc404469bae94b03fe8930c919464f78349cf Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:31:17 -0400 Subject: [PATCH 04/20] Fix miniscene2behavior --- src/kabr_tools/miniscene2behavior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 5a1a87e..d9c93dc 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -117,8 +117,8 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, miniscene_path: str tracks.append(track_id) # find all frames + # TODO: rewrite - some tracks may have different frames assert len(tracks) > 0, "No tracks found in track file" - track = tracks[-1] frames = [] for box in track.iterfind("box"): frames.append(int(box.attrib["frame"])) From b78b85f8cd3f9b4d95e79d3ca200b80f82b8dfa9 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:33:54 -0400 Subject: [PATCH 05/20] Type annotate cvat2slowfast --- src/kabr_tools/cvat2slowfast.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 
deletions(-) diff --git a/src/kabr_tools/cvat2slowfast.py b/src/kabr_tools/cvat2slowfast.py index 54cd4de..6e689ae 100644 --- a/src/kabr_tools/cvat2slowfast.py +++ b/src/kabr_tools/cvat2slowfast.py @@ -1,5 +1,6 @@ import os import sys +from typing import Optional import argparse import json from lxml import etree @@ -9,9 +10,7 @@ import cv2 -def cvat2slowfast(path_to_mini_scenes, path_to_new_dataset, label2number, old2new): - number2label = {value: key for key, value in label2number.items()} - +def cvat2slowfast(path_to_mini_scenes: str, path_to_new_dataset: str, label2number: dict, old2new: Optional[dict]) -> None: if not os.path.exists(path_to_new_dataset): os.makedirs(path_to_new_dataset) @@ -143,7 +142,7 @@ def cvat2slowfast(path_to_mini_scenes, path_to_new_dataset, label2number, old2ne f"{path_to_new_dataset}/annotation/data.csv", sep=" ", index=False) -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( '--miniscene', @@ -172,7 +171,7 @@ def parse_args(): return local_parser.parse_args() -def main(): +def main() -> None: args = parse_args() with open(args.classes, mode='r', encoding='utf-8') as file: From 15058418ede67e3f21bbc9f611afcb9d313124ba Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:39:43 -0400 Subject: [PATCH 06/20] Type annotate cvat2ultralytics --- src/kabr_tools/cvat2ultralytics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/kabr_tools/cvat2ultralytics.py b/src/kabr_tools/cvat2ultralytics.py index bc0dd4c..5bb9bf0 100644 --- a/src/kabr_tools/cvat2ultralytics.py +++ b/src/kabr_tools/cvat2ultralytics.py @@ -1,4 +1,5 @@ import os +from typing import Optional import argparse import json import cv2 @@ -10,7 +11,7 @@ from natsort import natsorted -def cvat2ultralytics(video_path, annotation_path, dataset, skip, label2index=None): +def cvat2ultralytics(video_path: str, annotation_path: str, dataset: str, skip: int, label2index: Optional[dict] = None): # Create a YOLO dataset structure. 
dataset_file = f""" path: {dataset} From 5ca8c98457256e2b12d773afd12b8b11e8b739fb Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:45:15 -0400 Subject: [PATCH 07/20] Type annotate detector2cvat --- src/kabr_tools/cvat2slowfast.py | 3 ++- src/kabr_tools/cvat2ultralytics.py | 8 +++++--- src/kabr_tools/detector2cvat.py | 15 +++++++-------- src/kabr_tools/miniscene2behavior.py | 4 +++- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/kabr_tools/cvat2slowfast.py b/src/kabr_tools/cvat2slowfast.py index 6e689ae..fbdcd92 100644 --- a/src/kabr_tools/cvat2slowfast.py +++ b/src/kabr_tools/cvat2slowfast.py @@ -10,7 +10,8 @@ import cv2 -def cvat2slowfast(path_to_mini_scenes: str, path_to_new_dataset: str, label2number: dict, old2new: Optional[dict]) -> None: +def cvat2slowfast(path_to_mini_scenes: str, path_to_new_dataset: str, + label2number: dict, old2new: Optional[dict]) -> None: if not os.path.exists(path_to_new_dataset): os.makedirs(path_to_new_dataset) diff --git a/src/kabr_tools/cvat2ultralytics.py b/src/kabr_tools/cvat2ultralytics.py index 5bb9bf0..5b757c8 100644 --- a/src/kabr_tools/cvat2ultralytics.py +++ b/src/kabr_tools/cvat2ultralytics.py @@ -11,7 +11,9 @@ from natsort import natsorted -def cvat2ultralytics(video_path: str, annotation_path: str, dataset: str, skip: int, label2index: Optional[dict] = None): +def cvat2ultralytics(video_path: str, annotation_path: str, + dataset: str, skip: int, + label2index: Optional[dict] = None) -> None: # Create a YOLO dataset structure. dataset_file = f""" path: {dataset} @@ -170,7 +172,7 @@ def cvat2ultralytics(video_path: str, annotation_path: str, dataset: str, skip: shutil.move(f"{dataset}/labels/train/{file}", f"{dataset}/labels/test/{file}") -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( '--video', @@ -205,7 +207,7 @@ def parse_args(): return local_parser.parse_args() -def main(): +def main() -> None: args = parse_args() if args.label2index: diff --git a/src/kabr_tools/detector2cvat.py b/src/kabr_tools/detector2cvat.py index 6a4c5b8..58b2f84 100644 --- a/src/kabr_tools/detector2cvat.py +++ b/src/kabr_tools/detector2cvat.py @@ -8,8 +8,7 @@ from kabr_tools.utils.draw import Draw - -def detector2cvat(path_to_videos, path_to_save): +def detector2cvat(path_to_videos: str, path_to_save: str) -> None: videos = [] for root, dirs, files in os.walk(path_to_videos): @@ -97,24 +96,24 @@ def detector2cvat(path_to_videos, path_to_save): print("Something went wrong...") -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( - '--video', + "--video", type=str, - help='path to folder containing videos', + help="path to folder containing videos", required=True ) local_parser.add_argument( - '--save', + "--save", type=str, - help='path to save output xml & mp4 files', + help="path to save output xml & mp4 files", required=True ) return local_parser.parse_args() -def main(): +def main() -> None: args = parse_args() detector2cvat(args.video, args.save) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index d9c93dc..ad7b6ed 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -105,7 +105,9 @@ def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[ return cfg, model -def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, 
miniscene_path: str, video: str, output_path: str) -> None: +def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, + miniscene_path: str, video: str, + output_path: str) -> None: label_data = [] track_file = f"{miniscene_path}/metadata/{video}_tracks.xml" root = etree.parse(track_file).getroot() From a90608b61f7101ce94b373d5645fd0d64db27431 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:59:05 -0400 Subject: [PATCH 08/20] Type annotate player --- src/kabr_tools/player.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/kabr_tools/player.py b/src/kabr_tools/player.py index d440085..28cb460 100644 --- a/src/kabr_tools/player.py +++ b/src/kabr_tools/player.py @@ -1,12 +1,13 @@ import os import argparse import json -from lxml import etree from collections import OrderedDict +from lxml import etree import cv2 +from cv2.typing import MatLike -def on_slider_change(value): +def on_slider_change(value: int) -> None: global index, vcs, current, trackbar_position, paused, updated index = value @@ -17,7 +18,7 @@ def on_slider_change(value): updated = True -def pad(image, width, height): +def pad(image: MatLike, width: int, height: int) -> MatLike: shape_0, shape_1 = image.shape[0], image.shape[1] if shape_0 < shape_1: @@ -34,7 +35,7 @@ def pad(image, width, height): return padded -def draw_aim(current, image): +def draw_aim(current: str, image: MatLike) -> MatLike: if current == "main": return image @@ -47,7 +48,8 @@ def draw_aim(current, image): return cv2.addWeighted(image, 0.4, copied, 0.6, 0.0) -def draw_id(current, image, metadata, width): +def draw_id(current: str, image: MatLike, + metadata: dict, width: int) -> MatLike: if current == "main": label = f"Drone View" color = (127, 127, 127) @@ -68,7 +70,9 @@ def draw_id(current, image, metadata, width): return cv2.addWeighted(image, 0.4, copied, 0.6, 0.0) -def draw_actions(current, index, image, actions, metadata, width, height): +def draw_actions(current: str, index: int, + image: MatLike, actions: OrderedDict, + metadata: dict, width: int, height: int) -> MatLike: if current == "main": return image @@ -92,7 +96,7 @@ def draw_actions(current, index, image, actions, metadata, width, height): return cv2.addWeighted(image, 0.4, copied, 0.6, 0.0) -def draw_info(image, width): +def draw_info(image: MatLike, width: int) -> MatLike: copied = image.copy() cv2.rectangle(image, (width - 600, 100), (width - 100, 340), (0, 0, 0), -1) cv2.putText(image, "[0-9]: Show Track #[0-9]", (width - 565, 150), @@ -107,7 +111,7 @@ def draw_info(image, width): return cv2.addWeighted(image, 0.4, copied, 0.6, 0.0) -def hotkey(key): +def hotkey(key: int) -> None: global current, metadata, vc, letter2hotkey mapped = letter2hotkey[key] @@ -130,7 +134,7 @@ def hotkey(key): vc.set(cv2.CAP_PROP_POS_FRAMES, metadata["tracks"][current][index]) -def player(folder, save): +def player(folder: str, save: bool) -> None: name = folder.split("/")[-1].split('|')[-1] metadata_path = f"{folder}/metadata/{name}_metadata.json" @@ -269,7 +273,7 @@ def player(folder, save): cv2.destroyAllWindows() -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( '--folder', @@ -285,7 +289,7 @@ def parse_args(): return local_parser.parse_args() -def main(): +def main() -> None: args = parse_args() player(args.folder, args.save) From e53c6d8bf52822b7ef971e85ce9ac734357de684 Mon Sep 17 00:00:00 2001 From: zhong-al 
<74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 22:14:49 -0400 Subject: [PATCH 09/20] Make imshow optional, update docs --- README.md | 2 +- src/kabr_tools/detector2cvat.py | 13 ++++++++++--- src/kabr_tools/player.py | 27 +++++++++++++++++---------- src/kabr_tools/tracks_extractor.py | 4 ++-- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 367f4be..559d937 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ You may use [YOLO](https://docs.ultralytics.com/) to automatically perform detec Detect objects with Ultralytics YOLO detections, apply SORT tracking and convert tracks to CVAT format. ``` -detector2cvat --video path_to_videos --save path_to_save +detector2cvat --video path_to_videos --save path_to_save [--imshow] ``` diff --git a/src/kabr_tools/detector2cvat.py b/src/kabr_tools/detector2cvat.py index 58b2f84..52df627 100644 --- a/src/kabr_tools/detector2cvat.py +++ b/src/kabr_tools/detector2cvat.py @@ -8,7 +8,7 @@ from kabr_tools.utils.draw import Draw -def detector2cvat(path_to_videos: str, path_to_save: str) -> None: +def detector2cvat(path_to_videos: str, path_to_save: str, show: bool) -> None: videos = [] for root, dirs, files in os.walk(path_to_videos): @@ -76,7 +76,9 @@ def detector2cvat(path_to_videos: str, path_to_save: str) -> None: cv2.putText(visualization, f"Frame: {index}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 3, cv2.LINE_AA) - cv2.imshow("detector2cvat", cv2.resize(visualization, (int(width // 2.5), int(height // 2.5)))) + if show: + cv2.imshow("detector2cvat", cv2.resize( + visualization, (int(width // 2.5), int(height // 2.5)))) vw.write(visualization) key = cv2.waitKey(1) index += 1 @@ -110,12 +112,17 @@ def parse_args() -> argparse.Namespace: help="path to save output xml & mp4 files", required=True ) + local_parser.add_argument( + "--imshow", + action="store_true", + help="flag to display detector's visualization" + ) return local_parser.parse_args() def main() -> None: args = parse_args() - detector2cvat(args.video, args.save) + detector2cvat(args.video, args.save, args.imshow) if __name__ == "__main__": diff --git a/src/kabr_tools/player.py b/src/kabr_tools/player.py index 28cb460..6c4a83f 100644 --- a/src/kabr_tools/player.py +++ b/src/kabr_tools/player.py @@ -134,7 +134,7 @@ def hotkey(key: int) -> None: vc.set(cv2.CAP_PROP_POS_FRAMES, metadata["tracks"][current][index]) -def player(folder: str, save: bool) -> None: +def player(folder: str, save: bool, show: bool) -> None: name = folder.split("/")[-1].split('|')[-1] metadata_path = f"{folder}/metadata/{name}_metadata.json" @@ -216,9 +216,11 @@ def player(folder: str, save: bool) -> None: cv2.setTrackbarPos(name, "TrackPlayer", index) cv2.putText(visualization, f"Frame: {index}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 3, cv2.LINE_AA) - - cv2.imshow("TrackPlayer", cv2.resize(visualization, (int(target_width // 2.5), int(target_height // 2.5)), - interpolation=cv2.INTER_AREA)) + if show: + cv2.imshow("TrackPlayer", + cv2.resize(visualization, + (int(target_width // 2.5), int(target_height // 2.5)), + interpolation=cv2.INTER_AREA)) if save: vw.write(visualization) @@ -276,22 +278,27 @@ def player(folder: str, save: bool) -> None: def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( - '--folder', + "--folder", type=str, - help='path to folder with metadata and actions', + help="path to folder with metadata and actions", required=True ) 
local_parser.add_argument( - '--save', - action='store_true', - help='Flag to save video' + "--save", + action="store_true", + help="flag to save video" + ) + local_parser.add_argument( + "--imshow", + action="store_true", + help="flag to display detector's visualization" ) return local_parser.parse_args() def main() -> None: args = parse_args() - player(args.folder, args.save) + player(args.folder, args.save, args.imshow) if __name__ == "__main__": diff --git a/src/kabr_tools/tracks_extractor.py b/src/kabr_tools/tracks_extractor.py index 6fbab25..112ac8e 100644 --- a/src/kabr_tools/tracks_extractor.py +++ b/src/kabr_tools/tracks_extractor.py @@ -225,12 +225,12 @@ def parse_args() -> argparse.Namespace: local_parser.add_argument( "--tracking", action="store_true", - help="Flag to use external tracker instead of CVAT tracks" + help="flag to use external tracker instead of CVAT tracks" ) local_parser.add_argument( "--imshow", action="store_true", - help="Flag to display tracks\' visualization" + help="flag to display tracks\' visualization" ) return local_parser.parse_args() From 9405859b1db55d03806714b7dbafc587d84fead2 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Wed, 16 Oct 2024 11:08:02 -0400 Subject: [PATCH 10/20] Find frames per track --- src/kabr_tools/miniscene2behavior.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index ad7b6ed..ed3f090 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -114,22 +114,21 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, # find all tracks tracks = [] + frames = {} for track in root.iterfind("track"): track_id = track.attrib["id"] tracks.append(track_id) + frames[track_id] = [] - # find all frames - # TODO: rewrite - some tracks may have different frames - assert len(tracks) > 0, "No tracks found in track file" - frames = [] - for box in track.iterfind("box"): - frames.append(int(box.attrib["frame"])) + # find all frames + for box in track.iterfind("box"): + frames[track_id].append(int(box.attrib["frame"])) # run model on miniscene for track in tracks: video_file = f"{miniscene_path}/{track}.mp4" cap = cv2.VideoCapture(video_file) - for frame in tqdm(frames, desc=f"{track} frames"): + for frame in tqdm(frames[track], desc=f"{track} frames"): inputs = get_input_clip(cap, cfg, frame) if cfg.NUM_GPUS: From d61b06abd33e5639aaf3ce0b71915fbaa501a240 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Wed, 16 Oct 2024 16:56:30 -0400 Subject: [PATCH 11/20] Account for track extraction --- src/kabr_tools/miniscene2behavior.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index ed3f090..0ebb036 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -128,8 +128,9 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, for track in tracks: video_file = f"{miniscene_path}/{track}.mp4" cap = cv2.VideoCapture(video_file) + start_frame = frames[track][0] for frame in tqdm(frames[track], desc=f"{track} frames"): - inputs = get_input_clip(cap, cfg, frame) + inputs = get_input_clip(cap, cfg, frame - start_frame) if cfg.NUM_GPUS: # transfer the data to the current GPU device. 
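
The two patches above rework annotate_miniscene so that annotated frames are collected per track and the keyframe index passed to the model is taken relative to the track's first annotated frame, since each mini-scene .mp4 contains only that track's frames. A minimal sketch of that bookkeeping follows; the XML content and frame numbers are hypothetical stand-ins for a real *_tracks.xml file, and it assumes each mini-scene video starts at the track's first annotated frame:

from lxml import etree

# Hypothetical, simplified stand-in for a mini-scene's metadata/<video>_tracks.xml.
xml = b"""
<annotations>
  <track id="0"><box frame="120"/><box frame="121"/><box frame="122"/></track>
  <track id="1"><box frame="300"/><box frame="301"/></track>
</annotations>
"""
root = etree.fromstring(xml)

# Collect annotated frame numbers per track, as the patch does.
frames = {}
for track in root.iterfind("track"):
    track_id = track.attrib["id"]
    frames[track_id] = [int(box.attrib["frame"]) for box in track.iterfind("box")]

# The keyframe handed to the model is offset by the track's first frame,
# mapping source-video frame numbers onto mini-scene frame indices.
for track_id, track_frames in frames.items():
    start_frame = track_frames[0]
    for frame in track_frames:
        print(track_id, frame, "->", frame - start_frame)
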
From a144cb1da96971be0a2e2c68d6443f32bc9cf233 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Thu, 17 Oct 2024 17:52:01 -0400 Subject: [PATCH 12/20] Add check to miniscene2behavior --- src/kabr_tools/miniscene2behavior.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 0ebb036..c012a4f 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -19,12 +19,16 @@ def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> li # https://github.com/facebookresearch/SlowFast/blob/bac7b672f40d44166a84e8c51d1a5ba367ace816/slowfast/visualization/ava_demo_precomputed_boxes.py seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + assert keyframe_idx < total_frames, f"keyframe_idx: {keyframe_idx}" \ + f">= total_frames: {total_frames}" seq = get_sequence( keyframe_idx, seq_length // 2, cfg.DATA.SAMPLING_RATE, total_frames, ) + # TODO: remove after debugging + print(seq) clip = [] for frame_idx in seq: cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) @@ -130,7 +134,11 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, cap = cv2.VideoCapture(video_file) start_frame = frames[track][0] for frame in tqdm(frames[track], desc=f"{track} frames"): - inputs = get_input_clip(cap, cfg, frame - start_frame) + try: + inputs = get_input_clip(cap, cfg, frame - start_frame) + except AssertionError as e: + print(e) + break if cfg.NUM_GPUS: # transfer the data to the current GPU device. @@ -153,6 +161,7 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, if frame % 20 == 0: pd.DataFrame(label_data).to_csv( output_path, sep=" ", index=False) + cap.release() pd.DataFrame(label_data).to_csv(output_path, sep=" ", index=False) From 2787bad23dc803246446129c9f4009b37d68a4e5 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Thu, 17 Oct 2024 19:30:07 -0400 Subject: [PATCH 13/20] Print more debug --- src/kabr_tools/miniscene2behavior.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index c012a4f..6a0db17 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -28,7 +28,7 @@ def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> li total_frames, ) # TODO: remove after debugging - print(seq) + print(keyframe_idx, seq[0], seq[-1], total_frames) clip = [] for frame_idx in seq: cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) @@ -143,8 +143,8 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, if cfg.NUM_GPUS: # transfer the data to the current GPU device. 
if isinstance(inputs, (list,)): - for i in range(len(inputs)): - inputs[i] = inputs[i].cuda(non_blocking=True) + for i, input_clip in enumerate(inputs): + inputs[i] = input_clip.cuda(non_blocking=True) else: inputs = inputs.cuda(non_blocking=True) From 488e0fe4f7350e3a3ffb9888a959b13e51c4ebce Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Thu, 17 Oct 2024 20:24:33 -0400 Subject: [PATCH 14/20] Use index because track frames can be noncontiguous --- src/kabr_tools/miniscene2behavior.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 6a0db17..430ecdd 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -132,10 +132,10 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, for track in tracks: video_file = f"{miniscene_path}/{track}.mp4" cap = cv2.VideoCapture(video_file) - start_frame = frames[track][0] - for frame in tqdm(frames[track], desc=f"{track} frames"): + print(f'{track=}') + for index, frame in tqdm(enumerate(frames[track]), desc=f'{track} frames'): try: - inputs = get_input_clip(cap, cfg, frame - start_frame) + inputs = get_input_clip(cap, cfg, index) except AssertionError as e: print(e) break From da5b0b8e463646b8dd7009d1058fda186885d77e Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:16:26 -0400 Subject: [PATCH 15/20] Fix spacing + tqdm bar --- src/kabr_tools/miniscene2behavior.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 430ecdd..61630df 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -20,15 +20,14 @@ def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> li seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) assert keyframe_idx < total_frames, f"keyframe_idx: {keyframe_idx}" \ - f">= total_frames: {total_frames}" + f" >= total_frames: {total_frames}" seq = get_sequence( keyframe_idx, seq_length // 2, cfg.DATA.SAMPLING_RATE, total_frames, ) - # TODO: remove after debugging - print(keyframe_idx, seq[0], seq[-1], total_frames) + clip = [] for frame_idx in seq: cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) @@ -132,13 +131,14 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, for track in tracks: video_file = f"{miniscene_path}/{track}.mp4" cap = cv2.VideoCapture(video_file) - print(f'{track=}') - for index, frame in tqdm(enumerate(frames[track]), desc=f'{track} frames'): + index = 0 + for frame in tqdm(frames[track], desc=f'{track} frames'): try: inputs = get_input_clip(cap, cfg, index) except AssertionError as e: print(e) break + index += 1 if cfg.NUM_GPUS: # transfer the data to the current GPU device. 
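
Patches 14 and 15 above replace the frame - start_frame offset with a running counter when reading clips from the mini-scene video. As the commit message notes, a track's annotated frames can be noncontiguous, so the offset from the first frame can point past the end of the extracted clip, while a sequential index matches the frames that were actually written to the mini-scene .mp4. A small illustrative sketch with hypothetical frame numbers:

# Hypothetical track with a gap in its annotated frames.
track_frames = [120, 121, 125, 126]

start_frame = track_frames[0]
for index, frame in enumerate(track_frames):
    offset = frame - start_frame  # 0, 1, 5, 6 -> overshoots the 4-frame mini-scene
    print(frame, offset, index)   # index 0, 1, 2, 3 matches the mini-scene video
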
From 49aaa4b7908bfb6bf0f9f4c765d7147dde515702 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:17:55 -0400 Subject: [PATCH 16/20] Set random seeds --- src/kabr_tools/miniscene2behavior.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 61630df..04bfabd 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -2,6 +2,7 @@ import argparse import torch from lxml import etree +import numpy as np import pandas as pd import cv2 from tqdm import tqdm @@ -100,6 +101,10 @@ def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[ cfg.OUTPUT_DIR = "" model = build.build_model(cfg) + # set random seeds + np.random.seed(cfg.RNG_SEED) + torch.manual_seed(cfg.RNG_SEED) + # load model checkpoint cu.load_checkpoint(checkpoint_path, model, data_parallel=False) From e77a5d15bdc9573cbff178bdbe2d41f3ab08d2a9 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Fri, 22 Nov 2024 20:57:23 -0500 Subject: [PATCH 17/20] Add slowfast code --- src/kabr_tools/utils/slowfast/LICENSE | 203 ++++ src/kabr_tools/utils/slowfast/__init__.py | 0 src/kabr_tools/utils/slowfast/cfg.py | 1295 +++++++++++++++++++++ src/kabr_tools/utils/slowfast/head.py | 145 +++ src/kabr_tools/utils/slowfast/norm.py | 109 ++ src/kabr_tools/utils/slowfast/resnet.py | 926 +++++++++++++++ src/kabr_tools/utils/slowfast/stem.py | 321 +++++ src/kabr_tools/utils/slowfast/utils.py | 115 ++ src/kabr_tools/utils/slowfast/x3d.py | 352 ++++++ 9 files changed, 3466 insertions(+) create mode 100644 src/kabr_tools/utils/slowfast/LICENSE create mode 100644 src/kabr_tools/utils/slowfast/__init__.py create mode 100644 src/kabr_tools/utils/slowfast/cfg.py create mode 100644 src/kabr_tools/utils/slowfast/head.py create mode 100644 src/kabr_tools/utils/slowfast/norm.py create mode 100644 src/kabr_tools/utils/slowfast/resnet.py create mode 100644 src/kabr_tools/utils/slowfast/stem.py create mode 100644 src/kabr_tools/utils/slowfast/utils.py create mode 100644 src/kabr_tools/utils/slowfast/x3d.py diff --git a/src/kabr_tools/utils/slowfast/LICENSE b/src/kabr_tools/utils/slowfast/LICENSE new file mode 100644 index 0000000..32e386e --- /dev/null +++ b/src/kabr_tools/utils/slowfast/LICENSE @@ -0,0 +1,203 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. 
+ +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. 
+ +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright 2019, Facebook, Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +https://github.com/facebookresearch/SlowFast/blob/main/LICENSE \ No newline at end of file diff --git a/src/kabr_tools/utils/slowfast/__init__.py b/src/kabr_tools/utils/slowfast/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/kabr_tools/utils/slowfast/cfg.py b/src/kabr_tools/utils/slowfast/cfg.py new file mode 100644 index 0000000..6ef6c20 --- /dev/null +++ b/src/kabr_tools/utils/slowfast/cfg.py @@ -0,0 +1,1295 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+ +"""Configs.""" +import math + +from fvcore.common.config import CfgNode + +# ----------------------------------------------------------------------------- +# Config definition +# ----------------------------------------------------------------------------- +_C = CfgNode() + +# ----------------------------------------------------------------------------- +# Contrastive Model (for MoCo, SimCLR, SwAV, BYOL) +# ----------------------------------------------------------------------------- + +_C.CONTRASTIVE = CfgNode() + +# temperature used for contrastive losses +_C.CONTRASTIVE.T = 0.07 + +# output dimension for the loss +_C.CONTRASTIVE.DIM = 128 + +# number of training samples (for kNN bank) +_C.CONTRASTIVE.LENGTH = 239975 + +# the length of MoCo's and MemBanks' queues +_C.CONTRASTIVE.QUEUE_LEN = 65536 + +# momentum for momentum encoder updates +_C.CONTRASTIVE.MOMENTUM = 0.5 + +# wether to anneal momentum to value above with cosine schedule +_C.CONTRASTIVE.MOMENTUM_ANNEALING = False + +# either memorybank, moco, simclr, byol, swav +_C.CONTRASTIVE.TYPE = "mem" + +# wether to interpolate memorybank in time +_C.CONTRASTIVE.INTERP_MEMORY = False + +# 1d or 2d (+temporal) memory +_C.CONTRASTIVE.MEM_TYPE = "1d" + +# number of classes for online kNN evaluation +_C.CONTRASTIVE.NUM_CLASSES_DOWNSTREAM = 400 + +# use an MLP projection with these num layers +_C.CONTRASTIVE.NUM_MLP_LAYERS = 1 + +# dimension of projection and predictor MLPs +_C.CONTRASTIVE.MLP_DIM = 2048 + +# use BN in projection/prediction MLP +_C.CONTRASTIVE.BN_MLP = False + +# use synchronized BN in projection/prediction MLP +_C.CONTRASTIVE.BN_SYNC_MLP = False + +# shuffle BN only locally vs. across machines +_C.CONTRASTIVE.LOCAL_SHUFFLE_BN = True + +# Wether to fill multiple clips (or just the first) into queue +_C.CONTRASTIVE.MOCO_MULTI_VIEW_QUEUE = False + +# if sampling multiple clips per vid they need to be at least min frames apart +_C.CONTRASTIVE.DELTA_CLIPS_MIN = -math.inf + +# if sampling multiple clips per vid they can be max frames apart +_C.CONTRASTIVE.DELTA_CLIPS_MAX = math.inf + +# if non empty, use predictors with depth specified +_C.CONTRASTIVE.PREDICTOR_DEPTHS = [] + +# Wether to sequentially process multiple clips (=lower mem usage) or batch them +_C.CONTRASTIVE.SEQUENTIAL = False + +# Wether to perform SimCLR loss across machines (or only locally) +_C.CONTRASTIVE.SIMCLR_DIST_ON = True + +# Length of queue used in SwAV +_C.CONTRASTIVE.SWAV_QEUE_LEN = 0 + +# Wether to run online kNN evaluation during training +_C.CONTRASTIVE.KNN_ON = True + + +# ---------------------------------------------------------------------------- # +# Batch norm options +# ---------------------------------------------------------------------------- # +_C.BN = CfgNode() + +# Precise BN stats. +_C.BN.USE_PRECISE_STATS = False + +# Number of samples use to compute precise bn. +_C.BN.NUM_BATCHES_PRECISE = 200 + +# Weight decay value that applies on BN. +_C.BN.WEIGHT_DECAY = 0.0 + +# Norm type, options include `batchnorm`, `sub_batchnorm`, `sync_batchnorm` +_C.BN.NORM_TYPE = "batchnorm" + +# Parameter for SubBatchNorm, where it splits the batch dimension into +# NUM_SPLITS splits, and run BN on each of them separately independently. +_C.BN.NUM_SPLITS = 1 + +# Parameter for NaiveSyncBatchNorm, where the stats across `NUM_SYNC_DEVICES` +# devices will be synchronized. `NUM_SYNC_DEVICES` cannot be larger than number of +# devices per machine; if global sync is desired, set `GLOBAL_SYNC`. 
+# By default ONLY applies to NaiveSyncBatchNorm3d; consider also setting +# CONTRASTIVE.BN_SYNC_MLP if appropriate. +_C.BN.NUM_SYNC_DEVICES = 1 + +# Parameter for NaiveSyncBatchNorm. Setting `GLOBAL_SYNC` to True synchronizes +# stats across all devices, across all machines; in this case, `NUM_SYNC_DEVICES` +# must be set to None. +# By default ONLY applies to NaiveSyncBatchNorm3d; consider also setting +# CONTRASTIVE.BN_SYNC_MLP if appropriate. +_C.BN.GLOBAL_SYNC = False + +# ---------------------------------------------------------------------------- # +# Training options. +# ---------------------------------------------------------------------------- # +_C.TRAIN = CfgNode() + +# If True Train the model, else skip training. +_C.TRAIN.ENABLE = True + +# Kill training if loss explodes over this ratio from the previous 5 measurements. +# Only enforced if > 0.0 +_C.TRAIN.KILL_LOSS_EXPLOSION_FACTOR = 0.0 + +# Dataset. +_C.TRAIN.DATASET = "kinetics" + +# Total mini-batch size. +_C.TRAIN.BATCH_SIZE = 64 + +# Evaluate model on test data every eval period epochs. +_C.TRAIN.EVAL_PERIOD = 10 + +# Save model checkpoint every checkpoint period epochs. +_C.TRAIN.CHECKPOINT_PERIOD = 10 + +# Resume training from the latest checkpoint in the output directory. +_C.TRAIN.AUTO_RESUME = True + +# Path to the checkpoint to load the initial weight. +_C.TRAIN.CHECKPOINT_FILE_PATH = "" + +# Checkpoint types include `caffe2` or `pytorch`. +_C.TRAIN.CHECKPOINT_TYPE = "pytorch" + +# If True, perform inflation when loading checkpoint. +_C.TRAIN.CHECKPOINT_INFLATE = False + +# If True, reset epochs when loading checkpoint. +_C.TRAIN.CHECKPOINT_EPOCH_RESET = False + +# If set, clear all layer names according to the pattern provided. +_C.TRAIN.CHECKPOINT_CLEAR_NAME_PATTERN = () # ("backbone.",) + +# If True, use FP16 for activations +_C.TRAIN.MIXED_PRECISION = False + +# if True, inflate some params from imagenet model. +_C.TRAIN.CHECKPOINT_IN_INIT = False + +# ---------------------------------------------------------------------------- # +# Augmentation options. +# ---------------------------------------------------------------------------- # +_C.AUG = CfgNode() + +# Whether to enable randaug. +_C.AUG.ENABLE = False + +# Number of repeated augmentations to used during training. +# If this is greater than 1, then the actual batch size is +# TRAIN.BATCH_SIZE * AUG.NUM_SAMPLE. +_C.AUG.NUM_SAMPLE = 1 + +# Not used if using randaug. +_C.AUG.COLOR_JITTER = 0.4 + +# RandAug parameters. +_C.AUG.AA_TYPE = "rand-m9-mstd0.5-inc1" + +# Interpolation method. +_C.AUG.INTERPOLATION = "bicubic" + +# Probability of random erasing. +_C.AUG.RE_PROB = 0.25 + +# Random erasing mode. +_C.AUG.RE_MODE = "pixel" + +# Random erase count. +_C.AUG.RE_COUNT = 1 + +# Do not random erase first (clean) augmentation split. +_C.AUG.RE_SPLIT = False + +# Whether to generate input mask during image processing. +_C.AUG.GEN_MASK_LOADER = False + +# If True, masking mode is "tube". Default is "cube". +_C.AUG.MASK_TUBE = False + +# If True, masking mode is "frame". Default is "cube". +_C.AUG.MASK_FRAMES = False + +# The size of generated masks. +_C.AUG.MASK_WINDOW_SIZE = [8, 7, 7] + +# The ratio of masked tokens out of all tokens. Also applies to MViT supervised training +_C.AUG.MASK_RATIO = 0.0 + +# The maximum number of a masked block. None means no maximum limit. (Used only in image MaskFeat.) 
+_C.AUG.MAX_MASK_PATCHES_PER_BLOCK = None + +# ---------------------------------------------------------------------------- # +# Masked pretraining visualization options. +# ---------------------------------------------------------------------------- # +_C.VIS_MASK = CfgNode() + +# Whether to do visualization. +_C.VIS_MASK.ENABLE = False + +# ---------------------------------------------------------------------------- # +# MipUp options. +# ---------------------------------------------------------------------------- # +_C.MIXUP = CfgNode() + +# Whether to use mixup. +_C.MIXUP.ENABLE = False + +# Mixup alpha. +_C.MIXUP.ALPHA = 0.8 + +# Cutmix alpha. +_C.MIXUP.CUTMIX_ALPHA = 1.0 + +# Probability of performing mixup or cutmix when either/both is enabled. +_C.MIXUP.PROB = 1.0 + +# Probability of switching to cutmix when both mixup and cutmix enabled. +_C.MIXUP.SWITCH_PROB = 0.5 + +# Label smoothing. +_C.MIXUP.LABEL_SMOOTH_VALUE = 0.1 + +# ---------------------------------------------------------------------------- # +# Testing options +# ---------------------------------------------------------------------------- # +_C.TEST = CfgNode() + +# If True test the model, else skip the testing. +_C.TEST.ENABLE = True + +# Dataset for testing. +_C.TEST.DATASET = "kinetics" + +# Total mini-batch size +_C.TEST.BATCH_SIZE = 8 + +# Path to the checkpoint to load the initial weight. +_C.TEST.CHECKPOINT_FILE_PATH = "" + +# Number of clips to sample from a video uniformly for aggregating the +# prediction results. +_C.TEST.NUM_ENSEMBLE_VIEWS = 10 + +# Number of crops to sample from a frame spatially for aggregating the +# prediction results. +_C.TEST.NUM_SPATIAL_CROPS = 3 + +# Checkpoint types include `caffe2` or `pytorch`. +_C.TEST.CHECKPOINT_TYPE = "pytorch" +# Path to saving prediction results file. +_C.TEST.SAVE_RESULTS_PATH = "" + +_C.TEST.NUM_TEMPORAL_CLIPS = [] +# ----------------------------------------------------------------------------- +# ResNet options +# ----------------------------------------------------------------------------- +_C.RESNET = CfgNode() + +# Transformation function. +_C.RESNET.TRANS_FUNC = "bottleneck_transform" + +# Number of groups. 1 for ResNet, and larger than 1 for ResNeXt). +_C.RESNET.NUM_GROUPS = 1 + +# Width of each group (64 -> ResNet; 4 -> ResNeXt). +_C.RESNET.WIDTH_PER_GROUP = 64 + +# Apply relu in a inplace manner. +_C.RESNET.INPLACE_RELU = True + +# Apply stride to 1x1 conv. +_C.RESNET.STRIDE_1X1 = False + +# If true, initialize the gamma of the final BN of each block to zero. +_C.RESNET.ZERO_INIT_FINAL_BN = False + +# If true, initialize the final conv layer of each block to zero. +_C.RESNET.ZERO_INIT_FINAL_CONV = False + +# Number of weight layers. +_C.RESNET.DEPTH = 50 + +# If the current block has more than NUM_BLOCK_TEMP_KERNEL blocks, use temporal +# kernel of 1 for the rest of the blocks. +_C.RESNET.NUM_BLOCK_TEMP_KERNEL = [[3], [4], [6], [3]] + +# Size of stride on different res stages. +_C.RESNET.SPATIAL_STRIDES = [[1], [2], [2], [2]] + +# Size of dilation on different res stages. +_C.RESNET.SPATIAL_DILATIONS = [[1], [1], [1], [1]] + +# ---------------------------------------------------------------------------- # +# X3D options +# See https://arxiv.org/abs/2004.04730 for details about X3D Networks. +# ---------------------------------------------------------------------------- # +_C.X3D = CfgNode() + +# Width expansion factor. +_C.X3D.WIDTH_FACTOR = 1.0 + +# Depth expansion factor. 
+_C.X3D.DEPTH_FACTOR = 1.0 + +# Bottleneck expansion factor for the 3x3x3 conv. +_C.X3D.BOTTLENECK_FACTOR = 1.0 # + +# Dimensions of the last linear layer before classificaiton. +_C.X3D.DIM_C5 = 2048 + +# Dimensions of the first 3x3 conv layer. +_C.X3D.DIM_C1 = 12 + +# Whether to scale the width of Res2, default is false. +_C.X3D.SCALE_RES2 = False + +# Whether to use a BatchNorm (BN) layer before the classifier, default is false. +_C.X3D.BN_LIN5 = False + +# Whether to use channelwise (=depthwise) convolution in the center (3x3x3) +# convolution operation of the residual blocks. +_C.X3D.CHANNELWISE_3x3x3 = True + +# ----------------------------------------------------------------------------- +# Nonlocal options +# ----------------------------------------------------------------------------- +_C.NONLOCAL = CfgNode() + +# Index of each stage and block to add nonlocal layers. +_C.NONLOCAL.LOCATION = [[[]], [[]], [[]], [[]]] + +# Number of group for nonlocal for each stage. +_C.NONLOCAL.GROUP = [[1], [1], [1], [1]] + +# Instatiation to use for non-local layer. +_C.NONLOCAL.INSTANTIATION = "dot_product" + + +# Size of pooling layers used in Non-Local. +_C.NONLOCAL.POOL = [ + # Res2 + [[1, 2, 2], [1, 2, 2]], + # Res3 + [[1, 2, 2], [1, 2, 2]], + # Res4 + [[1, 2, 2], [1, 2, 2]], + # Res5 + [[1, 2, 2], [1, 2, 2]], +] + +# ----------------------------------------------------------------------------- +# Model options +# ----------------------------------------------------------------------------- +_C.MODEL = CfgNode() + +# Model architecture. +_C.MODEL.ARCH = "slowfast" + +# Model name +_C.MODEL.MODEL_NAME = "SlowFast" + +# The number of classes to predict for the model. +_C.MODEL.NUM_CLASSES = 400 + +# Loss function. +_C.MODEL.LOSS_FUNC = "cross_entropy" + +# Model architectures that has one single pathway. +_C.MODEL.SINGLE_PATHWAY_ARCH = [ + "2d", + "c2d", + "i3d", + "slow", + "x3d", + "mvit", + "maskmvit", +] + +# Model architectures that has multiple pathways. +_C.MODEL.MULTI_PATHWAY_ARCH = ["slowfast"] + +# Dropout rate before final projection in the backbone. +_C.MODEL.DROPOUT_RATE = 0.5 + +# Randomly drop rate for Res-blocks, linearly increase from res2 to res5 +_C.MODEL.DROPCONNECT_RATE = 0.0 + +# The std to initialize the fc layer(s). +_C.MODEL.FC_INIT_STD = 0.01 + +# Activation layer for the output head. +_C.MODEL.HEAD_ACT = "softmax" + +# Activation checkpointing enabled or not to save GPU memory. +_C.MODEL.ACT_CHECKPOINT = False + +# If True, detach the final fc layer from the network, by doing so, only the +# final fc layer will be trained. +_C.MODEL.DETACH_FINAL_FC = False + +# If True, frozen batch norm stats during training. +_C.MODEL.FROZEN_BN = False + +# If True, AllReduce gradients are compressed to fp16 +_C.MODEL.FP16_ALLREDUCE = False + + +# ----------------------------------------------------------------------------- +# MViT options +# ----------------------------------------------------------------------------- +_C.MVIT = CfgNode() + +# Options include `conv`, `max`. +_C.MVIT.MODE = "conv" + +# If True, perform pool before projection in attention. +_C.MVIT.POOL_FIRST = False + +# If True, use cls embed in the network, otherwise don't use cls_embed in transformer. +_C.MVIT.CLS_EMBED_ON = True + +# Kernel size for patchtification. +_C.MVIT.PATCH_KERNEL = [3, 7, 7] + +# Stride size for patchtification. +_C.MVIT.PATCH_STRIDE = [2, 4, 4] + +# Padding size for patchtification. +_C.MVIT.PATCH_PADDING = [2, 4, 4] + +# If True, use 2d patch, otherwise use 3d patch. 
+_C.MVIT.PATCH_2D = False
+
+# Base embedding dimension for the transformer.
+_C.MVIT.EMBED_DIM = 96
+
+# Base num of heads for the transformer.
+_C.MVIT.NUM_HEADS = 1
+
+# Dimension reduction ratio for the MLP layers.
+_C.MVIT.MLP_RATIO = 4.0
+
+# If True, use bias term in attention fc layers.
+_C.MVIT.QKV_BIAS = True
+
+# Drop path rate for the transformer.
+_C.MVIT.DROPPATH_RATE = 0.1
+
+# The initial value of layer scale gamma. Set 0.0 to disable layer scale.
+_C.MVIT.LAYER_SCALE_INIT_VALUE = 0.0
+
+# Depth of the transformer.
+_C.MVIT.DEPTH = 16
+
+# Normalization layer for the transformer. Only layernorm is supported now.
+_C.MVIT.NORM = "layernorm"
+
+# Dimension multiplication at layer i. If 2.0 is used, then the next block will increase
+# the dimension by 2 times. Format: [depth_i: mul_dim_ratio]
+_C.MVIT.DIM_MUL = []
+
+# Head number multiplication at layer i. If 2.0 is used, then the next block will
+# increase the number of heads by 2 times. Format: [depth_i: head_mul_ratio]
+_C.MVIT.HEAD_MUL = []
+
+# Stride size for the Pool KV at layer i.
+# Format: [[i, stride_t_i, stride_h_i, stride_w_i], ...,]
+_C.MVIT.POOL_KV_STRIDE = []
+
+# Initial stride size for KV at layer 1. The stride size will be further reduced with
+# the ratio of MVIT.DIM_MUL. It will overwrite MVIT.POOL_KV_STRIDE if not None.
+_C.MVIT.POOL_KV_STRIDE_ADAPTIVE = None
+
+# Stride size for the Pool Q at layer i.
+# Format: [[i, stride_t_i, stride_h_i, stride_w_i], ...,]
+_C.MVIT.POOL_Q_STRIDE = []
+
+# If not None, overwrite the KV_KERNEL and Q_KERNEL size with POOL_KVQ_KERNEL.
+# Otherwise the kernel_size is [s + 1 if s > 1 else s for s in stride_size].
+_C.MVIT.POOL_KVQ_KERNEL = None
+
+# If True, perform no decay on positional embedding and cls embedding.
+_C.MVIT.ZERO_DECAY_POS_CLS = True
+
+# If True, use norm after stem.
+_C.MVIT.NORM_STEM = False
+
+# If True, perform separate positional embedding.
+_C.MVIT.SEP_POS_EMBED = False
+
+# Dropout rate for the MViT backbone.
+_C.MVIT.DROPOUT_RATE = 0.0
+
+# If True, use absolute positional embedding.
+_C.MVIT.USE_ABS_POS = True
+
+# If True, use relative positional embedding for spatial dimensions.
+_C.MVIT.REL_POS_SPATIAL = False
+
+# If True, use relative positional embedding for temporal dimensions.
+_C.MVIT.REL_POS_TEMPORAL = False
+
+# If True, initialize relative positional embeddings with zero.
+_C.MVIT.REL_POS_ZERO_INIT = False
+
+# If True, use the residual pooling connection.
+_C.MVIT.RESIDUAL_POOLING = False
+
+# Apply the dimension multiplication in the qkv linear layers of the attention
+# block instead of the MLP.
+_C.MVIT.DIM_MUL_IN_ATT = False
+
+# If True, use separate linear layers for Q, K, V in attention blocks.
+_C.MVIT.SEPARATE_QKV = False
+
+# The initialization scale factor for the head parameters.
+_C.MVIT.HEAD_INIT_SCALE = 1.0
+
+# Whether to use the mean pooling of all patch tokens as the output.
+_C.MVIT.USE_MEAN_POOLING = False
+
+# If True, use frozen sin cos positional embedding.
+_C.MVIT.USE_FIXED_SINCOS_POS = False
+
+# -----------------------------------------------------------------------------
+# Masked pretraining options
+# -----------------------------------------------------------------------------
+_C.MASK = CfgNode()
+
+# Whether to enable Masked style pretraining.
+_C.MASK.ENABLE = False
+
+# Whether to enable MAE (discard encoder tokens).
+_C.MASK.MAE_ON = False + +# Whether to enable random masking in mae +_C.MASK.MAE_RND_MASK = False + +# Whether to do random masking per-frame in mae +_C.MASK.PER_FRAME_MASKING = False + +# only predict loss on temporal strided patches, or predict full time extent +_C.MASK.TIME_STRIDE_LOSS = True + +# Whether to normalize the pred pixel loss +_C.MASK.NORM_PRED_PIXEL = True + +# Whether to fix initialization with inverse depth of layer for pretraining. +_C.MASK.SCALE_INIT_BY_DEPTH = False + +# Base embedding dimension for the decoder transformer. +_C.MASK.DECODER_EMBED_DIM = 512 + +# Base embedding dimension for the decoder transformer. +_C.MASK.DECODER_SEP_POS_EMBED = False + +# Use a KV kernel in decoder? +_C.MASK.DEC_KV_KERNEL = [] + +# Use a KV stride in decoder? +_C.MASK.DEC_KV_STRIDE = [] + +# The depths of features which are inputs of the prediction head. +_C.MASK.PRETRAIN_DEPTH = [15] + +# The type of Masked pretraining prediction head. +# Can be "separate", "separate_xformer". +_C.MASK.HEAD_TYPE = "separate" + +# The depth of MAE's decoder +_C.MASK.DECODER_DEPTH = 0 + +# The weight of HOG target loss. +_C.MASK.PRED_HOG = False +# Reversible Configs +_C.MVIT.REV = CfgNode() + +# Enable Reversible Model +_C.MVIT.REV.ENABLE = False + +# Method to fuse the reversible paths +# see :class: `TwoStreamFusion` for all the options +_C.MVIT.REV.RESPATH_FUSE = "concat" + +# Layers to buffer activations at +# (at least Q-pooling layers needed) +_C.MVIT.REV.BUFFER_LAYERS = [] + +# 'conv' or 'max' operator for the respath in Qpooling +_C.MVIT.REV.RES_PATH = "conv" + +# Method to merge hidden states before Qpoolinglayers +_C.MVIT.REV.PRE_Q_FUSION = "avg" + +# ----------------------------------------------------------------------------- +# SlowFast options +# ----------------------------------------------------------------------------- +_C.SLOWFAST = CfgNode() + +# Corresponds to the inverse of the channel reduction ratio, $\beta$ between +# the Slow and Fast pathways. +_C.SLOWFAST.BETA_INV = 8 + +# Corresponds to the frame rate reduction ratio, $\alpha$ between the Slow and +# Fast pathways. +_C.SLOWFAST.ALPHA = 8 + +# Ratio of channel dimensions between the Slow and Fast pathways. +_C.SLOWFAST.FUSION_CONV_CHANNEL_RATIO = 2 + +# Kernel dimension used for fusing information from Fast pathway to Slow +# pathway. +_C.SLOWFAST.FUSION_KERNEL_SZ = 5 + + +# ----------------------------------------------------------------------------- +# Data options +# ----------------------------------------------------------------------------- +_C.DATA = CfgNode() + +# The path to the data directory. +_C.DATA.PATH_TO_DATA_DIR = "" + +# The separator used between path and label. +_C.DATA.PATH_LABEL_SEPARATOR = " " + +# Video path prefix if any. +_C.DATA.PATH_PREFIX = "" + +# The number of frames of the input clip. +_C.DATA.NUM_FRAMES = 8 + +# The video sampling rate of the input clip. +_C.DATA.SAMPLING_RATE = 8 + +# Eigenvalues for PCA jittering. Note PCA is RGB based. +_C.DATA.TRAIN_PCA_EIGVAL = [0.225, 0.224, 0.229] + +# Eigenvectors for PCA jittering. +_C.DATA.TRAIN_PCA_EIGVEC = [ + [-0.5675, 0.7192, 0.4009], + [-0.5808, -0.0045, -0.8140], + [-0.5836, -0.6948, 0.4203], +] + +# If a imdb have been dumpped to a local file with the following format: +# `{"im_path": im_path, "class": cont_id}` +# then we can skip the construction of imdb and load it from the local file. +_C.DATA.PATH_TO_PRELOAD_IMDB = "" + +# The mean value of the video raw pixels across the R G B channels. 
+_C.DATA.MEAN = [0.45, 0.45, 0.45] +# List of input frame channel dimensions. + +_C.DATA.INPUT_CHANNEL_NUM = [3, 3] + +# The std value of the video raw pixels across the R G B channels. +_C.DATA.STD = [0.225, 0.225, 0.225] + +# The spatial augmentation jitter scales for training. +_C.DATA.TRAIN_JITTER_SCALES = [256, 320] + +# The relative scale range of Inception-style area based random resizing augmentation. +# If this is provided, DATA.TRAIN_JITTER_SCALES above is ignored. +_C.DATA.TRAIN_JITTER_SCALES_RELATIVE = [] + +# The relative aspect ratio range of Inception-style area based random resizing +# augmentation. +_C.DATA.TRAIN_JITTER_ASPECT_RELATIVE = [] + +# If True, perform stride length uniform temporal sampling. +_C.DATA.USE_OFFSET_SAMPLING = False + +# Whether to apply motion shift for augmentation. +_C.DATA.TRAIN_JITTER_MOTION_SHIFT = False + +# The spatial crop size for training. +_C.DATA.TRAIN_CROP_SIZE = 224 + +# The spatial crop size for testing. +_C.DATA.TEST_CROP_SIZE = 256 + +# Input videos may has different fps, convert it to the target video fps before +# frame sampling. +_C.DATA.TARGET_FPS = 30 + +# JITTER TARGET_FPS by +- this number randomly +_C.DATA.TRAIN_JITTER_FPS = 0.0 + +# Decoding backend, options include `pyav` or `torchvision` +_C.DATA.DECODING_BACKEND = "torchvision" + +# Decoding resize to short size (set to native size for best speed) +_C.DATA.DECODING_SHORT_SIZE = 256 + +# if True, sample uniformly in [1 / max_scale, 1 / min_scale] and take a +# reciprocal to get the scale. If False, take a uniform sample from +# [min_scale, max_scale]. +_C.DATA.INV_UNIFORM_SAMPLE = False + +# If True, perform random horizontal flip on the video frames during training. +_C.DATA.RANDOM_FLIP = True + +# If True, calculdate the map as metric. +_C.DATA.MULTI_LABEL = False + +# Method to perform the ensemble, options include "sum" and "max". +_C.DATA.ENSEMBLE_METHOD = "sum" + +# If True, revert the default input channel (RBG <-> BGR). +_C.DATA.REVERSE_INPUT_CHANNEL = False + +# how many samples (=clips) to decode from a single video +_C.DATA.TRAIN_CROP_NUM_TEMPORAL = 1 + +# how many spatial samples to crop from a single clip +_C.DATA.TRAIN_CROP_NUM_SPATIAL = 1 + +# color random percentage for grayscale conversion +_C.DATA.COLOR_RND_GRAYSCALE = 0.0 + +# loader can read .csv file in chunks of this chunk size +_C.DATA.LOADER_CHUNK_SIZE = 0 + +# if LOADER_CHUNK_SIZE > 0, define overall length of .csv file +_C.DATA.LOADER_CHUNK_OVERALL_SIZE = 0 + +# for chunked reading, dataloader can skip rows in (large) +# training csv file +_C.DATA.SKIP_ROWS = 0 + +# The separator used between path and label. 
+_C.DATA.PATH_LABEL_SEPARATOR = " " + +# augmentation probability to convert raw decoded video to +# grayscale temporal difference +_C.DATA.TIME_DIFF_PROB = 0.0 + +# Apply SSL-based SimCLR / MoCo v1/v2 color augmentations, +# with params below +_C.DATA.SSL_COLOR_JITTER = False + +# color jitter percentage for brightness, contrast, saturation +_C.DATA.SSL_COLOR_BRI_CON_SAT = [0.4, 0.4, 0.4] + +# color jitter percentage for hue +_C.DATA.SSL_COLOR_HUE = 0.1 + +# SimCLR / MoCo v2 augmentations on/off +_C.DATA.SSL_MOCOV2_AUG = False + +# SimCLR / MoCo v2 blur augmentation minimum gaussian sigma +_C.DATA.SSL_BLUR_SIGMA_MIN = [0.0, 0.1] + +# SimCLR / MoCo v2 blur augmentation maximum gaussian sigma +_C.DATA.SSL_BLUR_SIGMA_MAX = [0.0, 2.0] + + +# If combine train/val split as training for in21k +_C.DATA.IN22K_TRAINVAL = False + +# If not None, use IN1k as val split when training in21k +_C.DATA.IN22k_VAL_IN1K = "" + +# Large resolution models may use different crop ratios +_C.DATA.IN_VAL_CROP_RATIO = 0.875 # 224/256 = 0.875 + +# don't use real video for kinetics.py +_C.DATA.DUMMY_LOAD = False + +# ---------------------------------------------------------------------------- # +# Optimizer options +# ---------------------------------------------------------------------------- # +_C.SOLVER = CfgNode() + +# Base learning rate. +_C.SOLVER.BASE_LR = 0.1 + +# Learning rate policy (see utils/lr_policy.py for options and examples). +_C.SOLVER.LR_POLICY = "cosine" + +# Final learning rates for 'cosine' policy. +_C.SOLVER.COSINE_END_LR = 0.0 + +# Exponential decay factor. +_C.SOLVER.GAMMA = 0.1 + +# Step size for 'exp' and 'cos' policies (in epochs). +_C.SOLVER.STEP_SIZE = 1 + +# Steps for 'steps_' policies (in epochs). +_C.SOLVER.STEPS = [] + +# Learning rates for 'steps_' policies. +_C.SOLVER.LRS = [] + +# Maximal number of epochs. +_C.SOLVER.MAX_EPOCH = 300 + +# Momentum. +_C.SOLVER.MOMENTUM = 0.9 + +# Momentum dampening. +_C.SOLVER.DAMPENING = 0.0 + +# Nesterov momentum. +_C.SOLVER.NESTEROV = True + +# L2 regularization. +_C.SOLVER.WEIGHT_DECAY = 1e-4 + +# Start the warm up from SOLVER.BASE_LR * SOLVER.WARMUP_FACTOR. +_C.SOLVER.WARMUP_FACTOR = 0.1 + +# Gradually warm up the SOLVER.BASE_LR over this number of epochs. +_C.SOLVER.WARMUP_EPOCHS = 0.0 + +# The start learning rate of the warm up. +_C.SOLVER.WARMUP_START_LR = 0.01 + +# Optimization method. +_C.SOLVER.OPTIMIZING_METHOD = "sgd" + +# Base learning rate is linearly scaled with NUM_SHARDS. +_C.SOLVER.BASE_LR_SCALE_NUM_SHARDS = False + +# If True, start from the peak cosine learning rate after warm up. +_C.SOLVER.COSINE_AFTER_WARMUP = False + +# If True, perform no weight decay on parameter with one dimension (bias term, etc). +_C.SOLVER.ZERO_WD_1D_PARAM = False + +# Clip gradient at this value before optimizer update +_C.SOLVER.CLIP_GRAD_VAL = None + +# Clip gradient at this norm before optimizer update +_C.SOLVER.CLIP_GRAD_L2NORM = None + +# LARS optimizer +_C.SOLVER.LARS_ON = False + +# The layer-wise decay of learning rate. Set to 1. to disable. +_C.SOLVER.LAYER_DECAY = 1.0 + +# Adam's beta +_C.SOLVER.BETAS = (0.9, 0.999) +# ---------------------------------------------------------------------------- # +# Misc options +# ---------------------------------------------------------------------------- # + +# The name of the current task; e.g. "ssl"/"sl" for (self)supervised learning +_C.TASK = "" + +# Number of GPUs to use (applies to both training and testing). +_C.NUM_GPUS = 1 + +# Number of machine to use for the job. 
+_C.NUM_SHARDS = 1 + +# The index of the current machine. +_C.SHARD_ID = 0 + +# Output basedir. +_C.OUTPUT_DIR = "." + +# Note that non-determinism may still be present due to non-deterministic +# operator implementations in GPU operator libraries. +_C.RNG_SEED = 1 + +# Log period in iters. +_C.LOG_PERIOD = 10 + +# If True, log the model info. +_C.LOG_MODEL_INFO = True + +# Distributed backend. +_C.DIST_BACKEND = "nccl" + +# ---------------------------------------------------------------------------- # +# Benchmark options +# ---------------------------------------------------------------------------- # +_C.BENCHMARK = CfgNode() + +# Number of epochs for data loading benchmark. +_C.BENCHMARK.NUM_EPOCHS = 5 + +# Log period in iters for data loading benchmark. +_C.BENCHMARK.LOG_PERIOD = 100 + +# If True, shuffle dataloader for epoch during benchmark. +_C.BENCHMARK.SHUFFLE = True + + +# ---------------------------------------------------------------------------- # +# Common train/test data loader options +# ---------------------------------------------------------------------------- # +_C.DATA_LOADER = CfgNode() + +# Number of data loader workers per training process. +_C.DATA_LOADER.NUM_WORKERS = 8 + +# Load data to pinned host memory. +_C.DATA_LOADER.PIN_MEMORY = True + +# Enable multi thread decoding. +_C.DATA_LOADER.ENABLE_MULTI_THREAD_DECODE = False + + +# ---------------------------------------------------------------------------- # +# Detection options. +# ---------------------------------------------------------------------------- # +_C.DETECTION = CfgNode() + +# Whether enable video detection. +_C.DETECTION.ENABLE = False + +# Aligned version of RoI. More details can be found at slowfast/models/head_helper.py +_C.DETECTION.ALIGNED = True + +# Spatial scale factor. +_C.DETECTION.SPATIAL_SCALE_FACTOR = 16 + +# RoI tranformation resolution. +_C.DETECTION.ROI_XFORM_RESOLUTION = 7 + + +# ----------------------------------------------------------------------------- +# AVA Dataset options +# ----------------------------------------------------------------------------- +_C.AVA = CfgNode() + +# Directory path of frames. +_C.AVA.FRAME_DIR = "/mnt/fair-flash3-east/ava_trainval_frames.img/" + +# Directory path for files of frame lists. +_C.AVA.FRAME_LIST_DIR = ( + "/mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/" +) + +# Directory path for annotation files. +_C.AVA.ANNOTATION_DIR = ( + "/mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/" +) + +# Filenames of training samples list files. +_C.AVA.TRAIN_LISTS = ["train.csv"] + +# Filenames of test samples list files. +_C.AVA.TEST_LISTS = ["val.csv"] + +# Filenames of box list files for training. Note that we assume files which +# contains predicted boxes will have a suffix "predicted_boxes" in the +# filename. +_C.AVA.TRAIN_GT_BOX_LISTS = ["ava_train_v2.2.csv"] +_C.AVA.TRAIN_PREDICT_BOX_LISTS = [] + +# Filenames of box list files for test. +_C.AVA.TEST_PREDICT_BOX_LISTS = ["ava_val_predicted_boxes.csv"] + +# This option controls the score threshold for the predicted boxes to use. +_C.AVA.DETECTION_SCORE_THRESH = 0.9 + +# If use BGR as the format of input frames. +_C.AVA.BGR = False + +# Training augmentation parameters +# Whether to use color augmentation method. +_C.AVA.TRAIN_USE_COLOR_AUGMENTATION = False + +# Whether to only use PCA jitter augmentation when using color augmentation +# method (otherwise combine with color jitter method). 
+_C.AVA.TRAIN_PCA_JITTER_ONLY = True + +# Whether to do horizontal flipping during test. +_C.AVA.TEST_FORCE_FLIP = False + +# Whether to use full test set for validation split. +_C.AVA.FULL_TEST_ON_VAL = False + +# The name of the file to the ava label map. +_C.AVA.LABEL_MAP_FILE = "ava_action_list_v2.2_for_activitynet_2019.pbtxt" + +# The name of the file to the ava exclusion. +_C.AVA.EXCLUSION_FILE = "ava_val_excluded_timestamps_v2.2.csv" + +# The name of the file to the ava groundtruth. +_C.AVA.GROUNDTRUTH_FILE = "ava_val_v2.2.csv" + +# Backend to process image, includes `pytorch` and `cv2`. +_C.AVA.IMG_PROC_BACKEND = "cv2" + +# ---------------------------------------------------------------------------- # +# Multigrid training options +# See https://arxiv.org/abs/1912.00998 for details about multigrid training. +# ---------------------------------------------------------------------------- # +_C.MULTIGRID = CfgNode() + +# Multigrid training allows us to train for more epochs with fewer iterations. +# This hyperparameter specifies how many times more epochs to train. +# The default setting in paper trains for 1.5x more epochs than baseline. +_C.MULTIGRID.EPOCH_FACTOR = 1.5 + +# Enable short cycles. +_C.MULTIGRID.SHORT_CYCLE = False +# Short cycle additional spatial dimensions relative to the default crop size. +_C.MULTIGRID.SHORT_CYCLE_FACTORS = [0.5, 0.5**0.5] + +_C.MULTIGRID.LONG_CYCLE = False +# (Temporal, Spatial) dimensions relative to the default shape. +_C.MULTIGRID.LONG_CYCLE_FACTORS = [ + (0.25, 0.5**0.5), + (0.5, 0.5**0.5), + (0.5, 1), + (1, 1), +] + +# While a standard BN computes stats across all examples in a GPU, +# for multigrid training we fix the number of clips to compute BN stats on. +# See https://arxiv.org/abs/1912.00998 for details. +_C.MULTIGRID.BN_BASE_SIZE = 8 + +# Multigrid training epochs are not proportional to actual training time or +# computations, so _C.TRAIN.EVAL_PERIOD leads to too frequent or rare +# evaluation. We use a multigrid-specific rule to determine when to evaluate: +# This hyperparameter defines how many times to evaluate a model per long +# cycle shape. +_C.MULTIGRID.EVAL_FREQ = 3 + +# No need to specify; Set automatically and used as global variables. +_C.MULTIGRID.LONG_CYCLE_SAMPLING_RATE = 0 +_C.MULTIGRID.DEFAULT_B = 0 +_C.MULTIGRID.DEFAULT_T = 0 +_C.MULTIGRID.DEFAULT_S = 0 + +# ----------------------------------------------------------------------------- +# Tensorboard Visualization Options +# ----------------------------------------------------------------------------- +_C.TENSORBOARD = CfgNode() + +# Log to summary writer, this will automatically. +# log loss, lr and metrics during train/eval. +_C.TENSORBOARD.ENABLE = False +# Provide path to prediction results for visualization. +# This is a pickle file of [prediction_tensor, label_tensor] +_C.TENSORBOARD.PREDICTIONS_PATH = "" +# Path to directory for tensorboard logs. +# Default to to cfg.OUTPUT_DIR/runs-{cfg.TRAIN.DATASET}. +_C.TENSORBOARD.LOG_DIR = "" +# Path to a json file providing class_name - id mapping +# in the format {"class_name1": id1, "class_name2": id2, ...}. +# This file must be provided to enable plotting confusion matrix +# by a subset or parent categories. +_C.TENSORBOARD.CLASS_NAMES_PATH = "" + +# Path to a json file for categories -> classes mapping +# in the format {"parent_class": ["child_class1", "child_class2",...], ...}. +_C.TENSORBOARD.CATEGORIES_PATH = "" + +# Config for confusion matrices visualization. 
+_C.TENSORBOARD.CONFUSION_MATRIX = CfgNode() +# Visualize confusion matrix. +_C.TENSORBOARD.CONFUSION_MATRIX.ENABLE = False +# Figure size of the confusion matrices plotted. +_C.TENSORBOARD.CONFUSION_MATRIX.FIGSIZE = [8, 8] +# Path to a subset of categories to visualize. +# File contains class names separated by newline characters. +_C.TENSORBOARD.CONFUSION_MATRIX.SUBSET_PATH = "" + +# Config for histogram visualization. +_C.TENSORBOARD.HISTOGRAM = CfgNode() +# Visualize histograms. +_C.TENSORBOARD.HISTOGRAM.ENABLE = False +# Path to a subset of classes to plot histograms. +# Class names must be separated by newline characters. +_C.TENSORBOARD.HISTOGRAM.SUBSET_PATH = "" +# Visualize top-k most predicted classes on histograms for each +# chosen true label. +_C.TENSORBOARD.HISTOGRAM.TOPK = 10 +# Figure size of the histograms plotted. +_C.TENSORBOARD.HISTOGRAM.FIGSIZE = [8, 8] + +# Config for layers' weights and activations visualization. +# _C.TENSORBOARD.ENABLE must be True. +_C.TENSORBOARD.MODEL_VIS = CfgNode() + +# If False, skip model visualization. +_C.TENSORBOARD.MODEL_VIS.ENABLE = False + +# If False, skip visualizing model weights. +_C.TENSORBOARD.MODEL_VIS.MODEL_WEIGHTS = False + +# If False, skip visualizing model activations. +_C.TENSORBOARD.MODEL_VIS.ACTIVATIONS = False + +# If False, skip visualizing input videos. +_C.TENSORBOARD.MODEL_VIS.INPUT_VIDEO = False + + +# List of strings containing data about layer names and their indexing to +# visualize weights and activations for. The indexing is meant for +# choosing a subset of activations outputed by a layer for visualization. +# If indexing is not specified, visualize all activations outputed by the layer. +# For each string, layer name and indexing is separated by whitespaces. +# e.g.: [layer1 1,2;1,2, layer2, layer3 150,151;3,4]; this means for each array `arr` +# along the batch dimension in `layer1`, we take arr[[1, 2], [1, 2]] +_C.TENSORBOARD.MODEL_VIS.LAYER_LIST = [] +# Top-k predictions to plot on videos +_C.TENSORBOARD.MODEL_VIS.TOPK_PREDS = 1 +# Colormap to for text boxes and bounding boxes colors +_C.TENSORBOARD.MODEL_VIS.COLORMAP = "Pastel2" +# Config for visualization video inputs with Grad-CAM. +# _C.TENSORBOARD.ENABLE must be True. +_C.TENSORBOARD.MODEL_VIS.GRAD_CAM = CfgNode() +# Whether to run visualization using Grad-CAM technique. +_C.TENSORBOARD.MODEL_VIS.GRAD_CAM.ENABLE = True +# CNN layers to use for Grad-CAM. The number of layers must be equal to +# number of pathway(s). +_C.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST = [] +# If True, visualize Grad-CAM using true labels for each instances. +# If False, use the highest predicted class. +_C.TENSORBOARD.MODEL_VIS.GRAD_CAM.USE_TRUE_LABEL = False +# Colormap to for text boxes and bounding boxes colors +_C.TENSORBOARD.MODEL_VIS.GRAD_CAM.COLORMAP = "viridis" + +# Config for visualization for wrong prediction visualization. +# _C.TENSORBOARD.ENABLE must be True. +_C.TENSORBOARD.WRONG_PRED_VIS = CfgNode() +_C.TENSORBOARD.WRONG_PRED_VIS.ENABLE = False +# Folder tag to origanize model eval videos under. +_C.TENSORBOARD.WRONG_PRED_VIS.TAG = "Incorrectly classified videos." +# Subset of labels to visualize. Only wrong predictions with true labels +# within this subset is visualized. +_C.TENSORBOARD.WRONG_PRED_VIS.SUBSET_PATH = "" + + +# ---------------------------------------------------------------------------- # +# Demo options +# ---------------------------------------------------------------------------- # +_C.DEMO = CfgNode() + +# Run model in DEMO mode. 
+_C.DEMO.ENABLE = False + +# Path to a json file providing class_name - id mapping +# in the format {"class_name1": id1, "class_name2": id2, ...}. +_C.DEMO.LABEL_FILE_PATH = "" + +# Specify a camera device as input. This will be prioritized +# over input video if set. +# If -1, use input video instead. +_C.DEMO.WEBCAM = -1 + +# Path to input video for demo. +_C.DEMO.INPUT_VIDEO = "" +# Custom width for reading input video data. +_C.DEMO.DISPLAY_WIDTH = 0 +# Custom height for reading input video data. +_C.DEMO.DISPLAY_HEIGHT = 0 +# Path to Detectron2 object detection model configuration, +# only used for detection tasks. +_C.DEMO.DETECTRON2_CFG = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml" +# Path to Detectron2 object detection model pre-trained weights. +_C.DEMO.DETECTRON2_WEIGHTS = "detectron2://COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl" +# Threshold for choosing predicted bounding boxes by Detectron2. +_C.DEMO.DETECTRON2_THRESH = 0.9 +# Number of overlapping frames between 2 consecutive clips. +# Increase this number for more frequent action predictions. +# The number of overlapping frames cannot be larger than +# half of the sequence length `cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE` +_C.DEMO.BUFFER_SIZE = 0 +# If specified, the visualized outputs will be written this a video file of +# this path. Otherwise, the visualized outputs will be displayed in a window. +_C.DEMO.OUTPUT_FILE = "" +# Frames per second rate for writing to output video file. +# If not set (-1), use fps rate from input file. +_C.DEMO.OUTPUT_FPS = -1 +# Input format from demo video reader ("RGB" or "BGR"). +_C.DEMO.INPUT_FORMAT = "BGR" +# Draw visualization frames in [keyframe_idx - CLIP_VIS_SIZE, keyframe_idx + CLIP_VIS_SIZE] inclusively. +_C.DEMO.CLIP_VIS_SIZE = 10 +# Number of processes to run video visualizer. +_C.DEMO.NUM_VIS_INSTANCES = 2 + +# Path to pre-computed predicted boxes +_C.DEMO.PREDS_BOXES = "" +# Whether to run in with multi-threaded video reader. +_C.DEMO.THREAD_ENABLE = False +# Take one clip for every `DEMO.NUM_CLIPS_SKIP` + 1 for prediction and visualization. +# This is used for fast demo speed by reducing the prediction/visualiztion frequency. +# If -1, take the most recent read clip for visualization. This mode is only supported +# if `DEMO.THREAD_ENABLE` is set to True. +_C.DEMO.NUM_CLIPS_SKIP = 0 +# Path to ground-truth boxes and labels (optional) +_C.DEMO.GT_BOXES = "" +# The starting second of the video w.r.t bounding boxes file. +_C.DEMO.STARTING_SECOND = 900 +# Frames per second of the input video/folder of images. +_C.DEMO.FPS = 30 +# Visualize with top-k predictions or predictions above certain threshold(s). +# Option: {"thres", "top-k"} +_C.DEMO.VIS_MODE = "thres" +# Threshold for common class names. +_C.DEMO.COMMON_CLASS_THRES = 0.7 +# Theshold for uncommon class names. This will not be +# used if `_C.DEMO.COMMON_CLASS_NAMES` is empty. +_C.DEMO.UNCOMMON_CLASS_THRES = 0.3 +# This is chosen based on distribution of examples in +# each classes in AVA dataset. +_C.DEMO.COMMON_CLASS_NAMES = [ + "watch (a person)", + "talk to (e.g., self, a person, a group)", + "listen to (a person)", + "touch (an object)", + "carry/hold (an object)", + "walk", + "sit", + "lie/sleep", + "bend/bow (at the waist)", +] +# Slow-motion rate for the visualization. The visualized portions of the +# video will be played `_C.DEMO.SLOWMO` times slower than usual speed. +_C.DEMO.SLOWMO = 1 + + +def assert_and_infer_cfg(cfg): + # BN assertions. 
+ if cfg.BN.USE_PRECISE_STATS: + assert cfg.BN.NUM_BATCHES_PRECISE >= 0 + # TRAIN assertions. + assert cfg.TRAIN.CHECKPOINT_TYPE in ["pytorch", "caffe2"] + assert cfg.NUM_GPUS == 0 or cfg.TRAIN.BATCH_SIZE % cfg.NUM_GPUS == 0 + + # TEST assertions. + assert cfg.TEST.CHECKPOINT_TYPE in ["pytorch", "caffe2"] + assert cfg.NUM_GPUS == 0 or cfg.TEST.BATCH_SIZE % cfg.NUM_GPUS == 0 + + # RESNET assertions. + assert cfg.RESNET.NUM_GROUPS > 0 + assert cfg.RESNET.WIDTH_PER_GROUP > 0 + assert cfg.RESNET.WIDTH_PER_GROUP % cfg.RESNET.NUM_GROUPS == 0 + + # Execute LR scaling by num_shards. + if cfg.SOLVER.BASE_LR_SCALE_NUM_SHARDS: + cfg.SOLVER.BASE_LR *= cfg.NUM_SHARDS + cfg.SOLVER.WARMUP_START_LR *= cfg.NUM_SHARDS + cfg.SOLVER.COSINE_END_LR *= cfg.NUM_SHARDS + + # General assertions. + assert cfg.SHARD_ID < cfg.NUM_SHARDS + return cfg + + +def get_cfg(): + return _C.clone() + + +def load_config(path_to_config=None): + # Setup cfg. + cfg = get_cfg() + + # Load config from cfg. + if path_to_config is not None: + cfg.merge_from_file(path_to_config) + return cfg diff --git a/src/kabr_tools/utils/slowfast/head.py b/src/kabr_tools/utils/slowfast/head.py new file mode 100644 index 0000000..c16906c --- /dev/null +++ b/src/kabr_tools/utils/slowfast/head.py @@ -0,0 +1,145 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +"""ResNe(X)t Head helper.""" + +import torch.nn as nn + + +class X3DHead(nn.Module): + """ + X3D head. + This layer performs a fully-connected projection during training, when the + input size is 1x1x1. It performs a convolutional projection during testing + when the input size is larger than 1x1x1. If the inputs are from multiple + different pathways, the inputs will be concatenated after pooling. + """ + + def __init__( + self, + dim_in, + dim_inner, + dim_out, + num_classes, + pool_size, + dropout_rate=0.0, + act_func="softmax", + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + norm_module=nn.BatchNorm3d, + bn_lin5_on=False, + ): + """ + The `__init__` method of any subclass should also contain these + arguments. + X3DHead takes a 5-dim feature tensor (BxCxTxHxW) as input. + + Args: + dim_in (float): the channel dimension C of the input. + num_classes (int): the channel dimensions of the output. + pool_size (float): a single entry list of kernel size for + spatiotemporal pooling for the TxHxW dimensions. + dropout_rate (float): dropout rate. If equal to 0.0, perform no + dropout. + act_func (string): activation function to use. 'softmax': applies + softmax on the output. 'sigmoid': applies sigmoid on the output. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + bn_lin5_on (bool): if True, perform normalization on the features + before the classifier. 
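+
+        Example (illustrative sketch; the dimensions below are assumptions for
+        the sake of the example, not values from a KABR config):
+
+            import torch
+
+            head = X3DHead(
+                dim_in=192,
+                dim_inner=432,
+                dim_out=2048,
+                num_classes=400,
+                pool_size=None,  # None -> adaptive average pooling to 1x1x1
+                dropout_rate=0.5,
+            )
+            feats = [torch.randn(2, 192, 16, 7, 7)]  # one pathway: (B, C, T, H, W)
+            logits = head(feats)  # -> torch.Size([2, 400])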
+ """ + super(X3DHead, self).__init__() + self.pool_size = pool_size + self.dropout_rate = dropout_rate + self.num_classes = num_classes + self.act_func = act_func + self.eps = eps + self.bn_mmt = bn_mmt + self.inplace_relu = inplace_relu + self.bn_lin5_on = bn_lin5_on + self._construct_head(dim_in, dim_inner, dim_out, norm_module) + + def _construct_head(self, dim_in, dim_inner, dim_out, norm_module): + + self.conv_5 = nn.Conv3d( + dim_in, + dim_inner, + kernel_size=(1, 1, 1), + stride=(1, 1, 1), + padding=(0, 0, 0), + bias=False, + ) + self.conv_5_bn = norm_module( + num_features=dim_inner, eps=self.eps, momentum=self.bn_mmt + ) + self.conv_5_relu = nn.ReLU(self.inplace_relu) + + if self.pool_size is None: + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + else: + self.avg_pool = nn.AvgPool3d(self.pool_size, stride=1) + + self.lin_5 = nn.Conv3d( + dim_inner, + dim_out, + kernel_size=(1, 1, 1), + stride=(1, 1, 1), + padding=(0, 0, 0), + bias=False, + ) + if self.bn_lin5_on: + self.lin_5_bn = norm_module( + num_features=dim_out, eps=self.eps, momentum=self.bn_mmt + ) + self.lin_5_relu = nn.ReLU(self.inplace_relu) + + if self.dropout_rate > 0.0: + self.dropout = nn.Dropout(self.dropout_rate) + # Perform FC in a fully convolutional manner. The FC layer will be + # initialized with a different std comparing to convolutional layers. + self.projection = nn.Linear(dim_out, self.num_classes, bias=True) + + # Softmax for evaluation and testing. + if self.act_func == "softmax": + self.act = nn.Softmax(dim=4) + elif self.act_func == "sigmoid": + self.act = nn.Sigmoid() + else: + raise NotImplementedError( + "{} is not supported as an activation" "function.".format( + self.act_func) + ) + + def forward(self, inputs): + # In its current design the X3D head is only useable for a single + # pathway input. + assert len(inputs) == 1, "Input tensor does not contain 1 pathway" + x = self.conv_5(inputs[0]) + x = self.conv_5_bn(x) + x = self.conv_5_relu(x) + x = self.avg_pool(x) + + x = self.lin_5(x) + if self.bn_lin5_on: + x = self.lin_5_bn(x) + x = self.lin_5_relu(x) + + # (N, C, T, H, W) -> (N, T, H, W, C). + x = x.permute((0, 2, 3, 4, 1)) + # Perform dropout. + if hasattr(self, "dropout"): + x = self.dropout(x) + x = self.projection(x) + + # Performs fully convlutional inference. + if not self.training: + x = self.act(x) + x = x.mean([1, 2, 3]) + + x = x.view(x.shape[0], -1) + return x diff --git a/src/kabr_tools/utils/slowfast/norm.py b/src/kabr_tools/utils/slowfast/norm.py new file mode 100644 index 0000000..34fd479 --- /dev/null +++ b/src/kabr_tools/utils/slowfast/norm.py @@ -0,0 +1,109 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +"""BatchNorm (BN) utility functions and custom batch-size BN implementations""" + +from functools import partial + +import torch +import torch.nn as nn + +from pytorchvideo.layers.batch_norm import NaiveSyncBatchNorm3d + + +def get_norm(cfg): + """ + Args: + cfg (CfgNode): model building configs, details are in the comments of + the config file. + Returns: + nn.Module: the normalization layer. 
+ """ + if cfg.BN.NORM_TYPE in {"batchnorm", "sync_batchnorm_apex"}: + return nn.BatchNorm3d + elif cfg.BN.NORM_TYPE == "sub_batchnorm": + return partial(SubBatchNorm3d, num_splits=cfg.BN.NUM_SPLITS) + elif cfg.BN.NORM_TYPE == "sync_batchnorm": + return partial( + NaiveSyncBatchNorm3d, + num_sync_devices=cfg.BN.NUM_SYNC_DEVICES, + global_sync=cfg.BN.GLOBAL_SYNC, + ) + else: + raise NotImplementedError( + "Norm type {} is not supported".format(cfg.BN.NORM_TYPE) + ) + + +class SubBatchNorm3d(nn.Module): + """ + The standard BN layer computes stats across all examples in a GPU. In some + cases it is desirable to compute stats across only a subset of examples + (e.g., in multigrid training https://arxiv.org/abs/1912.00998). + SubBatchNorm3d splits the batch dimension into N splits, and run BN on + each of them separately (so that the stats are computed on each subset of + examples (1/N of batch) independently. During evaluation, it aggregates + the stats from all splits into one BN. + """ + + def __init__(self, num_splits, **args): + """ + Args: + num_splits (int): number of splits. + args (list): other arguments. + """ + super(SubBatchNorm3d, self).__init__() + self.num_splits = num_splits + num_features = args["num_features"] + # Keep only one set of weight and bias. + if args.get("affine", True): + self.affine = True + args["affine"] = False + self.weight = torch.nn.Parameter(torch.ones(num_features)) + self.bias = torch.nn.Parameter(torch.zeros(num_features)) + else: + self.affine = False + self.bn = nn.BatchNorm3d(**args) + args["num_features"] = num_features * num_splits + self.split_bn = nn.BatchNorm3d(**args) + + def _get_aggregated_mean_std(self, means, stds, n): + """ + Calculate the aggregated mean and stds. + Args: + means (tensor): mean values. + stds (tensor): standard deviations. + n (int): number of sets of means and stds. + """ + mean = means.view(n, -1).sum(0) / n + std = ( + stds.view(n, -1).sum(0) / n + + ((means.view(n, -1) - mean) ** 2).view(n, -1).sum(0) / n + ) + return mean.detach(), std.detach() + + def aggregate_stats(self): + """ + Synchronize running_mean, and running_var. Call this before eval. + """ + if self.split_bn.track_running_stats: + ( + self.bn.running_mean.data, + self.bn.running_var.data, + ) = self._get_aggregated_mean_std( + self.split_bn.running_mean, + self.split_bn.running_var, + self.num_splits, + ) + + def forward(self, x): + if self.training: + n, c, t, h, w = x.shape + x = x.view(n // self.num_splits, c * self.num_splits, t, h, w) + x = self.split_bn(x) + x = x.view(n, c, t, h, w) + else: + x = self.bn(x) + if self.affine: + x = x * self.weight.view((-1, 1, 1, 1)) + x = x + self.bias.view((-1, 1, 1, 1)) + return x diff --git a/src/kabr_tools/utils/slowfast/resnet.py b/src/kabr_tools/utils/slowfast/resnet.py new file mode 100644 index 0000000..d98c0f2 --- /dev/null +++ b/src/kabr_tools/utils/slowfast/resnet.py @@ -0,0 +1,926 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +"""Video models.""" + +import torch +import torch.nn as nn +from pytorchvideo.layers.swish import Swish + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + """ + Stochastic Depth per sample. 
+ """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * ( + x.ndim - 1 + ) # work with diff dim tensors, not just 2D ConvNets + mask = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + mask.floor_() # binarize + output = x.div(keep_prob) * mask + return output + +class Nonlocal(nn.Module): + """ + Builds Non-local Neural Networks as a generic family of building + blocks for capturing long-range dependencies. Non-local Network + computes the response at a position as a weighted sum of the + features at all positions. This building block can be plugged into + many computer vision architectures. + More details in the paper: https://arxiv.org/pdf/1711.07971.pdf + """ + + def __init__( + self, + dim, + dim_inner, + pool_size=None, + instantiation="softmax", + zero_init_final_conv=False, + zero_init_final_norm=True, + norm_eps=1e-5, + norm_momentum=0.1, + norm_module=nn.BatchNorm3d, + ): + """ + Args: + dim (int): number of dimension for the input. + dim_inner (int): number of dimension inside of the Non-local block. + pool_size (list): the kernel size of spatial temporal pooling, + temporal pool kernel size, spatial pool kernel size, spatial + pool kernel size in order. By default pool_size is None, + then there would be no pooling used. + instantiation (string): supports two different instantiation method: + "dot_product": normalizing correlation matrix with L2. + "softmax": normalizing correlation matrix with Softmax. + zero_init_final_conv (bool): If true, zero initializing the final + convolution of the Non-local block. + zero_init_final_norm (bool): + If true, zero initializing the final batch norm of the Non-local + block. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + """ + super(Nonlocal, self).__init__() + self.dim = dim + self.dim_inner = dim_inner + self.pool_size = pool_size + self.instantiation = instantiation + self.use_pool = ( + False if pool_size is None else any((size > 1 for size in pool_size)) + ) + self.norm_eps = norm_eps + self.norm_momentum = norm_momentum + self._construct_nonlocal( + zero_init_final_conv, zero_init_final_norm, norm_module + ) + + def _construct_nonlocal( + self, zero_init_final_conv, zero_init_final_norm, norm_module + ): + # Three convolution heads: theta, phi, and g. + self.conv_theta = nn.Conv3d( + self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 + ) + self.conv_phi = nn.Conv3d( + self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 + ) + self.conv_g = nn.Conv3d( + self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 + ) + + # Final convolution output. + self.conv_out = nn.Conv3d( + self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0 + ) + # Zero initializing the final convolution output. + self.conv_out.zero_init = zero_init_final_conv + + # TODO: change the name to `norm` + self.bn = norm_module( + num_features=self.dim, + eps=self.norm_eps, + momentum=self.norm_momentum, + ) + # Zero initializing the final bn. + self.bn.transform_final_bn = zero_init_final_norm + + # Optional to add the spatial-temporal pooling. + if self.use_pool: + self.pool = nn.MaxPool3d( + kernel_size=self.pool_size, + stride=self.pool_size, + padding=[0, 0, 0], + ) + + def forward(self, x): + x_identity = x + N, C, T, H, W = x.size() + + theta = self.conv_theta(x) + + # Perform temporal-spatial pooling to reduce the computation. 
+ if self.use_pool: + x = self.pool(x) + + phi = self.conv_phi(x) + g = self.conv_g(x) + + theta = theta.view(N, self.dim_inner, -1) + phi = phi.view(N, self.dim_inner, -1) + g = g.view(N, self.dim_inner, -1) + + # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW). + theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi)) + # For original Non-local paper, there are two main ways to normalize + # the affinity tensor: + # 1) Softmax normalization (norm on exp). + # 2) dot_product normalization. + if self.instantiation == "softmax": + # Normalizing the affinity tensor theta_phi before softmax. + theta_phi = theta_phi * (self.dim_inner**-0.5) + theta_phi = nn.functional.softmax(theta_phi, dim=2) + elif self.instantiation == "dot_product": + spatial_temporal_dim = theta_phi.shape[2] + theta_phi = theta_phi / spatial_temporal_dim + else: + raise NotImplementedError("Unknown norm type {}".format(self.instantiation)) + + # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW). + theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g)) + + # (N, C, TxHxW) => (N, C, T, H, W). + theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W) + + p = self.conv_out(theta_phi_g) + p = self.bn(p) + return x_identity + p + +class SE(nn.Module): + """Squeeze-and-Excitation (SE) block w/ Swish: AvgPool, FC, Swish, FC, Sigmoid.""" + + def _round_width(self, width, multiplier, min_width=8, divisor=8): + """ + Round width of filters based on width multiplier + Args: + width (int): the channel dimensions of the input. + multiplier (float): the multiplication factor. + min_width (int): the minimum width after multiplication. + divisor (int): the new width should be dividable by divisor. + """ + if not multiplier: + return width + + width *= multiplier + min_width = min_width or divisor + width_out = max(min_width, int(width + divisor / 2) // divisor * divisor) + if width_out < 0.9 * width: + width_out += divisor + return int(width_out) + + def __init__(self, dim_in, ratio, relu_act=True): + """ + Args: + dim_in (int): the channel dimensions of the input. + ratio (float): the channel reduction ratio for squeeze. + relu_act (bool): whether to use ReLU activation instead + of Swish (default). + divisor (int): the new width should be dividable by divisor. + """ + super(SE, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + dim_fc = self._round_width(dim_in, ratio) + self.fc1 = nn.Conv3d(dim_in, dim_fc, 1, bias=True) + self.fc1_act = nn.ReLU() if relu_act else Swish() + self.fc2 = nn.Conv3d(dim_fc, dim_in, 1, bias=True) + + self.fc2_sig = nn.Sigmoid() + + def forward(self, x): + x_in = x + for module in self.children(): + x = module(x) + return x_in * x + + + + +def get_trans_func(name): + """ + Retrieves the transformation module by name. + """ + trans_funcs = { + "bottleneck_transform": BottleneckTransform, + "basic_transform": BasicTransform, + "x3d_transform": X3DTransform, + } + assert ( + name in trans_funcs.keys() + ), "Transformation function '{}' not supported".format(name) + return trans_funcs[name] + + +class BasicTransform(nn.Module): + """ + Basic transformation: Tx3x3, 1x3x3, where T is the size of temporal kernel. + """ + + def __init__( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner=None, + num_groups=1, + stride_1x1=None, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + dilation=1, + norm_module=nn.BatchNorm3d, + block_idx=0, + ): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. 
+ temp_kernel_size (int): the temporal kernel sizes of the first + convolution in the basic block. + stride (int): the stride of the bottleneck. + dim_inner (None): the inner dimension would not be used in + BasicTransform. + num_groups (int): number of groups for the convolution. Number of + group is always 1 for BasicTransform. + stride_1x1 (None): stride_1x1 will not be used in BasicTransform. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + """ + super(BasicTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._bn_mmt = bn_mmt + self._construct(dim_in, dim_out, stride, dilation, norm_module) + + def _construct(self, dim_in, dim_out, stride, dilation, norm_module): + # Tx3x3, BN, ReLU. + self.a = nn.Conv3d( + dim_in, + dim_out, + kernel_size=[self.temp_kernel_size, 3, 3], + stride=[1, stride, stride], + padding=[int(self.temp_kernel_size // 2), 1, 1], + bias=False, + ) + self.a_bn = norm_module( + num_features=dim_out, eps=self._eps, momentum=self._bn_mmt + ) + self.a_relu = nn.ReLU(inplace=self._inplace_relu) + # 1x3x3, BN. + self.b = nn.Conv3d( + dim_out, + dim_out, + kernel_size=[1, 3, 3], + stride=[1, 1, 1], + padding=[0, dilation, dilation], + dilation=[1, dilation, dilation], + bias=False, + ) + + self.b.final_conv = True + + self.b_bn = norm_module( + num_features=dim_out, eps=self._eps, momentum=self._bn_mmt + ) + + self.b_bn.transform_final_bn = True + + def forward(self, x): + x = self.a(x) + x = self.a_bn(x) + x = self.a_relu(x) + + x = self.b(x) + x = self.b_bn(x) + return x + + +class X3DTransform(nn.Module): + """ + X3D transformation: 1x1x1, Tx3x3 (channelwise, num_groups=dim_in), 1x1x1, + augmented with (optional) SE (squeeze-excitation) on the 3x3x3 output. + T is the temporal kernel size (defaulting to 3) + """ + + def __init__( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + dilation=1, + norm_module=nn.BatchNorm3d, + se_ratio=0.0625, + swish_inner=True, + block_idx=0, + ): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + dilation (int): size of dilation. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + se_ratio (float): if > 0, apply SE to the Tx3x3 conv, with the SE + channel dimensionality being se_ratio times the Tx3x3 conv dim. 
+ swish_inner (bool): if True, apply swish to the Tx3x3 conv, otherwise + apply ReLU to the Tx3x3 conv. + """ + super(X3DTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._bn_mmt = bn_mmt + self._se_ratio = se_ratio + self._swish_inner = swish_inner + self._stride_1x1 = stride_1x1 + self._block_idx = block_idx + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + dilation, + norm_module, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + dilation, + norm_module, + ): + (str1x1, str3x3) = (stride, 1) if self._stride_1x1 else (1, stride) + + # 1x1x1, BN, ReLU. + self.a = nn.Conv3d( + dim_in, + dim_inner, + kernel_size=[1, 1, 1], + stride=[1, str1x1, str1x1], + padding=[0, 0, 0], + bias=False, + ) + self.a_bn = norm_module( + num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt + ) + self.a_relu = nn.ReLU(inplace=self._inplace_relu) + + # Tx3x3, BN, ReLU. + self.b = nn.Conv3d( + dim_inner, + dim_inner, + [self.temp_kernel_size, 3, 3], + stride=[1, str3x3, str3x3], + padding=[int(self.temp_kernel_size // 2), dilation, dilation], + groups=num_groups, + bias=False, + dilation=[1, dilation, dilation], + ) + self.b_bn = norm_module( + num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt + ) + + # Apply SE attention or not + use_se = True if (self._block_idx + 1) % 2 else False + if self._se_ratio > 0.0 and use_se: + self.se = SE(dim_inner, self._se_ratio) + + if self._swish_inner: + self.b_relu = Swish() + else: + self.b_relu = nn.ReLU(inplace=self._inplace_relu) + + # 1x1x1, BN. + self.c = nn.Conv3d( + dim_inner, + dim_out, + kernel_size=[1, 1, 1], + stride=[1, 1, 1], + padding=[0, 0, 0], + bias=False, + ) + self.c_bn = norm_module( + num_features=dim_out, eps=self._eps, momentum=self._bn_mmt + ) + self.c_bn.transform_final_bn = True + + def forward(self, x): + for block in self.children(): + x = block(x) + return x + + +class BottleneckTransform(nn.Module): + """ + Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of + temporal kernel. + """ + + def __init__( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + dilation=1, + norm_module=nn.BatchNorm3d, + block_idx=0, + ): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the first + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + dilation (int): size of dilation. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. 
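+
+        Example (illustrative sketch; the shapes are assumptions for the sake
+        of the example):
+
+            import torch
+
+            trans = BottleneckTransform(
+                dim_in=256,
+                dim_out=256,
+                temp_kernel_size=3,
+                stride=1,
+                dim_inner=64,
+                num_groups=1,
+            )
+            out = trans(torch.randn(2, 256, 8, 56, 56))  # shape is preserved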
+ """ + super(BottleneckTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._bn_mmt = bn_mmt + self._stride_1x1 = stride_1x1 + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + dilation, + norm_module, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + dilation, + norm_module, + ): + (str1x1, str3x3) = (stride, 1) if self._stride_1x1 else (1, stride) + + # Tx1x1, BN, ReLU. + self.a = nn.Conv3d( + dim_in, + dim_inner, + kernel_size=[self.temp_kernel_size, 1, 1], + stride=[1, str1x1, str1x1], + padding=[int(self.temp_kernel_size // 2), 0, 0], + bias=False, + ) + self.a_bn = norm_module( + num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt + ) + self.a_relu = nn.ReLU(inplace=self._inplace_relu) + + # 1x3x3, BN, ReLU. + self.b = nn.Conv3d( + dim_inner, + dim_inner, + [1, 3, 3], + stride=[1, str3x3, str3x3], + padding=[0, dilation, dilation], + groups=num_groups, + bias=False, + dilation=[1, dilation, dilation], + ) + self.b_bn = norm_module( + num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt + ) + self.b_relu = nn.ReLU(inplace=self._inplace_relu) + + # 1x1x1, BN. + self.c = nn.Conv3d( + dim_inner, + dim_out, + kernel_size=[1, 1, 1], + stride=[1, 1, 1], + padding=[0, 0, 0], + bias=False, + ) + self.c.final_conv = True + + self.c_bn = norm_module( + num_features=dim_out, eps=self._eps, momentum=self._bn_mmt + ) + self.c_bn.transform_final_bn = True + + def forward(self, x): + # Explicitly forward every layer. + # Branch2a. + x = self.a(x) + x = self.a_bn(x) + x = self.a_relu(x) + + # Branch2b. + x = self.b(x) + x = self.b_bn(x) + x = self.b_relu(x) + + # Branch2c + x = self.c(x) + x = self.c_bn(x) + return x + + +class ResBlock(nn.Module): + """ + Residual block. + """ + + def __init__( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + trans_func, + dim_inner, + num_groups=1, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + dilation=1, + norm_module=nn.BatchNorm3d, + block_idx=0, + drop_connect_rate=0.0, + ): + """ + ResBlock class constructs redisual blocks. More details can be found in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. + "Deep residual learning for image recognition." + https://arxiv.org/abs/1512.03385 + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + trans_func (string): transform function to be used to construct the + bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + dilation (int): size of dilation. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + drop_connect_rate (float): basic rate at which blocks are dropped, + linearly increases from input to output blocks. 
+ """ + super(ResBlock, self).__init__() + self._inplace_relu = inplace_relu + self._eps = eps + self._bn_mmt = bn_mmt + self._drop_connect_rate = drop_connect_rate + self._construct( + dim_in, + dim_out, + temp_kernel_size, + stride, + trans_func, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + norm_module, + block_idx, + ) + + def _construct( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + trans_func, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + norm_module, + block_idx, + ): + # Use skip connection with projection if dim or res change. + if (dim_in != dim_out) or (stride != 1): + self.branch1 = nn.Conv3d( + dim_in, + dim_out, + kernel_size=1, + stride=[1, stride, stride], + padding=0, + bias=False, + dilation=1, + ) + self.branch1_bn = norm_module( + num_features=dim_out, eps=self._eps, momentum=self._bn_mmt + ) + self.branch2 = trans_func( + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation, + norm_module=norm_module, + block_idx=block_idx, + ) + self.relu = nn.ReLU(self._inplace_relu) + + def forward(self, x): + f_x = self.branch2(x) + if self.training and self._drop_connect_rate > 0.0: + f_x = drop_path(f_x, self._drop_connect_rate) + if hasattr(self, "branch1"): + x = self.branch1_bn(self.branch1(x)) + f_x + else: + x = x + f_x + x = self.relu(x) + return x + + +class ResStage(nn.Module): + """ + Stage of 3D ResNet. It expects to have one or more tensors as input for + single pathway (C2D, I3D, Slow), and multi-pathway (SlowFast) cases. + More details can be found here: + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "SlowFast networks for video recognition." + https://arxiv.org/pdf/1812.03982.pdf + """ + + def __init__( + self, + dim_in, + dim_out, + stride, + temp_kernel_sizes, + num_blocks, + dim_inner, + num_groups, + num_block_temp_kernel, + nonlocal_inds, + nonlocal_group, + nonlocal_pool, + dilation, + instantiation="softmax", + trans_func_name="bottleneck_transform", + stride_1x1=False, + inplace_relu=True, + norm_module=nn.BatchNorm3d, + drop_connect_rate=0.0, + ): + """ + The `__init__` method of any subclass should also contain these arguments. + ResStage builds p streams, where p can be greater or equal to one. + Args: + dim_in (list): list of p the channel dimensions of the input. + Different channel dimensions control the input dimension of + different pathways. + dim_out (list): list of p the channel dimensions of the output. + Different channel dimensions control the input dimension of + different pathways. + temp_kernel_sizes (list): list of the p temporal kernel sizes of the + convolution in the bottleneck. Different temp_kernel_sizes + control different pathway. + stride (list): list of the p strides of the bottleneck. Different + stride control different pathway. + num_blocks (list): list of p numbers of blocks for each of the + pathway. + dim_inner (list): list of the p inner channel dimensions of the + input. Different channel dimensions control the input dimension + of different pathways. + num_groups (list): list of number of p groups for the convolution. + num_groups=1 is for standard ResNet like networks, and + num_groups>1 is for ResNeXt like networks. + num_block_temp_kernel (list): extent the temp_kernel_sizes to + num_block_temp_kernel blocks, then fill temporal kernel size + of 1 for the rest of the layers. 
+ nonlocal_inds (list): If the tuple is empty, no nonlocal layer will + be added. If the tuple is not empty, add nonlocal layers after + the index-th block. + dilation (list): size of dilation for each pathway. + nonlocal_group (list): list of number of p nonlocal groups. Each + number controls how to fold temporal dimension to batch + dimension before applying nonlocal transformation. + https://github.com/facebookresearch/video-nonlocal-net. + instantiation (string): different instantiation for nonlocal layer. + Supports two different instantiation method: + "dot_product": normalizing correlation matrix with L2. + "softmax": normalizing correlation matrix with Softmax. + trans_func_name (string): name of the the transformation function apply + on the network. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + drop_connect_rate (float): basic rate at which blocks are dropped, + linearly increases from input to output blocks. + """ + super(ResStage, self).__init__() + assert all( + ( + num_block_temp_kernel[i] <= num_blocks[i] + for i in range(len(temp_kernel_sizes)) + ) + ) + self.num_blocks = num_blocks + self.nonlocal_group = nonlocal_group + self._drop_connect_rate = drop_connect_rate + self.temp_kernel_sizes = [ + (temp_kernel_sizes[i] * num_blocks[i])[: num_block_temp_kernel[i]] + + [1] * (num_blocks[i] - num_block_temp_kernel[i]) + for i in range(len(temp_kernel_sizes)) + ] + assert ( + len( + { + len(dim_in), + len(dim_out), + len(temp_kernel_sizes), + len(stride), + len(num_blocks), + len(dim_inner), + len(num_groups), + len(num_block_temp_kernel), + len(nonlocal_inds), + len(nonlocal_group), + } + ) + == 1 + ) + self.num_pathways = len(self.num_blocks) + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + trans_func_name, + stride_1x1, + inplace_relu, + nonlocal_inds, + nonlocal_pool, + instantiation, + dilation, + norm_module, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + trans_func_name, + stride_1x1, + inplace_relu, + nonlocal_inds, + nonlocal_pool, + instantiation, + dilation, + norm_module, + ): + for pathway in range(self.num_pathways): + for i in range(self.num_blocks[pathway]): + # Retrieve the transformation function. + trans_func = get_trans_func(trans_func_name) + # Construct the block. 
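+                # Only the first block of a stage changes the channel count and
+                # applies the spatial stride; later blocks keep dim_out and stride 1.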
+ res_block = ResBlock( + dim_in[pathway] if i == 0 else dim_out[pathway], + dim_out[pathway], + self.temp_kernel_sizes[pathway][i], + stride[pathway] if i == 0 else 1, + trans_func, + dim_inner[pathway], + num_groups[pathway], + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation[pathway], + norm_module=norm_module, + block_idx=i, + drop_connect_rate=self._drop_connect_rate, + ) + self.add_module("pathway{}_res{}".format( + pathway, i), res_block) + if i in nonlocal_inds[pathway]: + nln = Nonlocal( + dim_out[pathway], + dim_out[pathway] // 2, + nonlocal_pool[pathway], + instantiation=instantiation, + norm_module=norm_module, + ) + self.add_module( + "pathway{}_nonlocal{}".format(pathway, i), nln) + + def forward(self, inputs): + output = [] + for pathway in range(self.num_pathways): + x = inputs[pathway] + for i in range(self.num_blocks[pathway]): + m = getattr(self, "pathway{}_res{}".format(pathway, i)) + x = m(x) + if hasattr(self, "pathway{}_nonlocal{}".format(pathway, i)): + nln = getattr( + self, "pathway{}_nonlocal{}".format(pathway, i)) + b, c, t, h, w = x.shape + if self.nonlocal_group[pathway] > 1: + # Fold temporal dimension into batch dimension. + x = x.permute(0, 2, 1, 3, 4) + x = x.reshape( + b * self.nonlocal_group[pathway], + t // self.nonlocal_group[pathway], + c, + h, + w, + ) + x = x.permute(0, 2, 1, 3, 4) + x = nln(x) + if self.nonlocal_group[pathway] > 1: + # Fold back to temporal dimension. + x = x.permute(0, 2, 1, 3, 4) + x = x.reshape(b, t, c, h, w) + x = x.permute(0, 2, 1, 3, 4) + output.append(x) + + return output diff --git a/src/kabr_tools/utils/slowfast/stem.py b/src/kabr_tools/utils/slowfast/stem.py new file mode 100644 index 0000000..beda2ec --- /dev/null +++ b/src/kabr_tools/utils/slowfast/stem.py @@ -0,0 +1,321 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +"""ResNe(X)t 3D stem helper.""" + +import torch.nn as nn + + +def get_stem_func(name): + """ + Retrieves the stem module by name. + """ + trans_funcs = {"x3d_stem": X3DStem, "basic_stem": ResNetBasicStem} + assert ( + name in trans_funcs.keys() + ), "Transformation function '{}' not supported".format(name) + return trans_funcs[name] + + +class VideoModelStem(nn.Module): + """ + Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool + on input data tensor for one or multiple pathways. + """ + + def __init__( + self, + dim_in, + dim_out, + kernel, + stride, + padding, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + norm_module=nn.BatchNorm3d, + stem_func_name="basic_stem", + ): + """ + The `__init__` method of any subclass should also contain these + arguments. List size of 1 for single pathway models (C2D, I3D, Slow + and etc), list size of 2 for two pathway models (SlowFast). + + Args: + dim_in (list): the list of channel dimensions of the inputs. + dim_out (list): the output dimension of the convolution in the stem + layer. + kernel (list): the kernels' size of the convolutions in the stem + layers. Temporal kernel size, height kernel size, width kernel + size in order. + stride (list): the stride sizes of the convolutions in the stem + layer. Temporal kernel stride, height kernel size, width kernel + size in order. + padding (list): the paddings' sizes of the convolutions in the stem + layer. Temporal padding size, height padding size, width padding + size in order. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. 
+ bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + stem_func_name (string): name of the the stem function applied on + input to the network. + """ + super(VideoModelStem, self).__init__() + + assert ( + len( + { + len(dim_in), + len(dim_out), + len(kernel), + len(stride), + len(padding), + } + ) + == 1 + ), "Input pathway dimensions are not consistent. {} {} {} {} {}".format( + len(dim_in), + len(dim_out), + len(kernel), + len(stride), + len(padding), + ) + + self.num_pathways = len(dim_in) + self.kernel = kernel + self.stride = stride + self.padding = padding + self.inplace_relu = inplace_relu + self.eps = eps + self.bn_mmt = bn_mmt + # Construct the stem layer. + self._construct_stem(dim_in, dim_out, norm_module, stem_func_name) + + def _construct_stem(self, dim_in, dim_out, norm_module, stem_func_name): + trans_func = get_stem_func(stem_func_name) + + for pathway in range(len(dim_in)): + stem = trans_func( + dim_in[pathway], + dim_out[pathway], + self.kernel[pathway], + self.stride[pathway], + self.padding[pathway], + self.inplace_relu, + self.eps, + self.bn_mmt, + norm_module, + ) + self.add_module("pathway{}_stem".format(pathway), stem) + + def forward(self, x): + assert ( + len(x) == self.num_pathways + ), "Input tensor does not contain {} pathway".format(self.num_pathways) + # use a new list, don't modify in-place the x list, which is bad for activation checkpointing. + y = [] + for pathway in range(len(x)): + m = getattr(self, "pathway{}_stem".format(pathway)) + y.append(m(x[pathway])) + return y + + +class ResNetBasicStem(nn.Module): + """ + ResNe(X)t 3D stem module. + Performs spatiotemporal Convolution, BN, and Relu following by a + spatiotemporal pooling. + """ + + def __init__( + self, + dim_in, + dim_out, + kernel, + stride, + padding, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + norm_module=nn.BatchNorm3d, + ): + """ + The `__init__` method of any subclass should also contain these arguments. + + Args: + dim_in (int): the channel dimension of the input. Normally 3 is used + for rgb input, and 2 or 3 is used for optical flow input. + dim_out (int): the output dimension of the convolution in the stem + layer. + kernel (list): the kernel size of the convolution in the stem layer. + temporal kernel size, height kernel size, width kernel size in + order. + stride (list): the stride size of the convolution in the stem layer. + temporal kernel stride, height kernel size, width kernel size in + order. + padding (int): the padding size of the convolution in the stem + layer, temporal padding size, height padding size, width + padding size in order. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + """ + super(ResNetBasicStem, self).__init__() + self.kernel = kernel + self.stride = stride + self.padding = padding + self.inplace_relu = inplace_relu + self.eps = eps + self.bn_mmt = bn_mmt + # Construct the stem layer. 
+ self._construct_stem(dim_in, dim_out, norm_module) + + def _construct_stem(self, dim_in, dim_out, norm_module): + self.conv = nn.Conv3d( + dim_in, + dim_out, + self.kernel, + stride=self.stride, + padding=self.padding, + bias=False, + ) + self.bn = norm_module(num_features=dim_out, + eps=self.eps, momentum=self.bn_mmt) + self.relu = nn.ReLU(self.inplace_relu) + self.pool_layer = nn.MaxPool3d( + kernel_size=[1, 3, 3], stride=[1, 2, 2], padding=[0, 1, 1] + ) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + x = self.pool_layer(x) + return x + + +class X3DStem(nn.Module): + """ + X3D's 3D stem module. + Performs a spatial followed by a depthwise temporal Convolution, BN, and Relu following by a + spatiotemporal pooling. + """ + + def __init__( + self, + dim_in, + dim_out, + kernel, + stride, + padding, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + norm_module=nn.BatchNorm3d, + ): + """ + The `__init__` method of any subclass should also contain these arguments. + + Args: + dim_in (int): the channel dimension of the input. Normally 3 is used + for rgb input, and 2 or 3 is used for optical flow input. + dim_out (int): the output dimension of the convolution in the stem + layer. + kernel (list): the kernel size of the convolution in the stem layer. + temporal kernel size, height kernel size, width kernel size in + order. + stride (list): the stride size of the convolution in the stem layer. + temporal kernel stride, height kernel size, width kernel size in + order. + padding (int): the padding size of the convolution in the stem + layer, temporal padding size, height padding size, width + padding size in order. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + """ + super(X3DStem, self).__init__() + self.kernel = kernel + self.stride = stride + self.padding = padding + self.inplace_relu = inplace_relu + self.eps = eps + self.bn_mmt = bn_mmt + # Construct the stem layer. + self._construct_stem(dim_in, dim_out, norm_module) + + def _construct_stem(self, dim_in, dim_out, norm_module): + self.conv_xy = nn.Conv3d( + dim_in, + dim_out, + kernel_size=(1, self.kernel[1], self.kernel[2]), + stride=(1, self.stride[1], self.stride[2]), + padding=(0, self.padding[1], self.padding[2]), + bias=False, + ) + self.conv = nn.Conv3d( + dim_out, + dim_out, + kernel_size=(self.kernel[0], 1, 1), + stride=(self.stride[0], 1, 1), + padding=(self.padding[0], 0, 0), + bias=False, + groups=dim_out, + ) + + self.bn = norm_module(num_features=dim_out, + eps=self.eps, momentum=self.bn_mmt) + self.relu = nn.ReLU(self.inplace_relu) + + def forward(self, x): + x = self.conv_xy(x) + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class PatchEmbed(nn.Module): + """ + PatchEmbed. 
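+
+    Splits the input into patches with a strided (2D or 3D) convolution and
+    flattens the spatial/temporal grid into a token sequence (B, N, C).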
+ """ + + def __init__( + self, + dim_in=3, + dim_out=768, + kernel=(1, 16, 16), + stride=(1, 4, 4), + padding=(1, 7, 7), + conv_2d=False, + ): + super().__init__() + if conv_2d: + conv = nn.Conv2d + else: + conv = nn.Conv3d + self.proj = conv( + dim_in, + dim_out, + kernel_size=kernel, + stride=stride, + padding=padding, + ) + + def forward(self, x, keep_spatial=False): + x = self.proj(x) + if keep_spatial: + return x, x.shape + # B C (T) H W -> B (T)HW C + return x.flatten(2).transpose(1, 2), x.shape diff --git a/src/kabr_tools/utils/slowfast/utils.py b/src/kabr_tools/utils/slowfast/utils.py new file mode 100644 index 0000000..ec3408d --- /dev/null +++ b/src/kabr_tools/utils/slowfast/utils.py @@ -0,0 +1,115 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# https://github.com/facebookresearch/SlowFast/blob/bac7b672f40d44166a84e8c51d1a5ba367ace816/slowfast/visualization/ava_demo_precomputed_boxes.py + +import math +import cv2 +import torch +import numpy as np +from torch import Tensor + + +def get_sequence(center_idx, half_len, sample_rate, num_frames): + seq = list(range(center_idx - half_len, center_idx + half_len, sample_rate)) + + for seq_idx in range(len(seq)): + if seq[seq_idx] < 0: + seq[seq_idx] = 0 + elif seq[seq_idx] >= num_frames: + seq[seq_idx] = num_frames - 1 + return seq + + +def scale(size, image): + height = image.shape[0] + width = image.shape[1] + if (width <= height and width == size) or (height <= width and height == size): + return image + new_width = size + new_height = size + if width < height: + new_height = int(math.floor((float(height) / width) * size)) + else: + new_width = int(math.floor((float(width) / height) * size)) + img = cv2.resize(image, (new_width, new_height), + interpolation=cv2.INTER_LINEAR) + return img.astype(np.float32) + + +def process_cv2_inputs(frames, cfg): + inputs = torch.from_numpy(np.array(frames)).float() / 255 + inputs = tensor_normalize(inputs, cfg.DATA.MEAN, cfg.DATA.STD) + # T H W C -> C T H W. + inputs = inputs.permute(3, 0, 1, 2) + # Sample frames for num_frames specified. + index = torch.linspace(0, inputs.shape[1] - 1, cfg.DATA.NUM_FRAMES).long() + inputs = torch.index_select(inputs, 1, index) + inputs = pack_pathway_output(cfg, inputs) + inputs = [inp.unsqueeze(0) for inp in inputs] + return inputs + + +def tensor_normalize(tensor, mean, std, func=None): + if tensor.dtype == torch.uint8: + tensor = tensor.float() + tensor = tensor / 255.0 + if type(mean) == list: + mean = torch.tensor(mean) + if type(std) == list: + std = torch.tensor(std) + if func is not None: + tensor = func(tensor) + tensor = tensor - mean + tensor = tensor / std + return tensor + + +def pack_pathway_output(cfg, frames): + if cfg.DATA.REVERSE_INPUT_CHANNEL: + frames = frames[[2, 1, 0], :, :, :] + if cfg.MODEL.ARCH in cfg.MODEL.SINGLE_PATHWAY_ARCH: + frame_list = [frames] + elif cfg.MODEL.ARCH in cfg.MODEL.MULTI_PATHWAY_ARCH: + fast_pathway = frames + # Perform temporal sampling from the fast pathway. 
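+        # The slow pathway keeps 1/ALPHA of the frames, e.g. 32 fast frames
+        # with SLOWFAST.ALPHA = 4 yield an 8-frame slow clip.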
+ slow_pathway = torch.index_select( + frames, + 1, + torch.linspace( + 0, frames.shape[1] - 1, frames.shape[1] // cfg.SLOWFAST.ALPHA + ).long(), + ) + frame_list = [slow_pathway, fast_pathway] + else: + raise NotImplementedError( + "Model arch {} is not in {}".format( + cfg.MODEL.ARCH, + cfg.MODEL.SINGLE_PATHWAY_ARCH + cfg.MODEL.MULTI_PATHWAY_ARCH, + ) + ) + return frame_list + + +def get_input_clip(cap: cv2.VideoCapture, cfg, keyframe_idx: int) -> list[Tensor]: + + seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + seq = get_sequence( + keyframe_idx, + seq_length // 2, + cfg.DATA.SAMPLING_RATE, + total_frames, + ) + clip = [] + for frame_idx in seq: + cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) + was_read, frame = cap.read() + if was_read: + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = scale(cfg.DATA.TEST_CROP_SIZE, frame) + clip.append(frame) + else: + print("Unable to read frame. Duplicating previous frame.") + clip.append(clip[-1]) + + clip = process_cv2_inputs(clip, cfg) + return clip diff --git a/src/kabr_tools/utils/slowfast/x3d.py b/src/kabr_tools/utils/slowfast/x3d.py new file mode 100644 index 0000000..628f51e --- /dev/null +++ b/src/kabr_tools/utils/slowfast/x3d.py @@ -0,0 +1,352 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import math +import torch +from torch import nn +from torch.distributed.algorithms.ddp_comm_hooks import default as comm_hooks_default +from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill + +from .norm import get_norm +from .stem import VideoModelStem +from .resnet import ResStage +from .head import X3DHead + +# round width + + +def round_width(width, multiplier, min_width=1, divisor=1): + if not multiplier: + return width + width *= multiplier + min_width = min_width or divisor + width_out = max(min_width, int(width + divisor / 2) // divisor * divisor) + if width_out < 0.9 * width: + width_out += divisor + return int(width_out) + +# init weights + + +def init_weights( + model, fc_init_std=0.01, zero_init_final_bn=True, zero_init_final_conv=False +): + """ + Performs ResNet style weight initialization. + Args: + fc_init_std (float): the expected standard deviation for fc layer. + zero_init_final_bn (bool): if True, zero initialize the final bn for + every bottleneck. + """ + for m in model.modules(): + if isinstance(m, nn.Conv3d): + # Note that there is no bias due to BN + if hasattr(m, "final_conv") and zero_init_final_conv: + m.weight.data.zero_() + else: + """ + Follow the initialization method proposed in: + {He, Kaiming, et al. + "Delving deep into rectifiers: Surpassing human-level + performance on imagenet classification." 
+ arXiv preprint arXiv:1502.01852 (2015)} + """ + c2_msra_fill(m) + + elif isinstance(m, (nn.BatchNorm3d, nn.BatchNorm2d, nn.BatchNorm1d)): + if ( + hasattr(m, "transform_final_bn") + and m.transform_final_bn + and zero_init_final_bn + ): + batchnorm_weight = 0.0 + else: + batchnorm_weight = 1.0 + if m.weight is not None: + m.weight.data.fill_(batchnorm_weight) + if m.bias is not None: + m.bias.data.zero_() + if isinstance(m, nn.Linear): + if hasattr(m, "xavier_init") and m.xavier_init: + c2_xavier_fill(m) + else: + m.weight.data.normal_(mean=0.0, std=fc_init_std) + if m.bias is not None: + m.bias.data.zero_() + + +# pool1 + +_POOL1 = { + "2d": [[1, 1, 1]], + "c2d": [[2, 1, 1]], + "slow_c2d": [[1, 1, 1]], + "i3d": [[2, 1, 1]], + "slow_i3d": [[1, 1, 1]], + "slow": [[1, 1, 1]], + "slowfast": [[1, 1, 1], [1, 1, 1]], + "x3d": [[1, 1, 1]], +} + +# temporal kernel basis + +_TEMPORAL_KERNEL_BASIS = { + "2d": [ + [[1]], # conv1 temporal kernel. + [[1]], # res2 temporal kernel. + [[1]], # res3 temporal kernel. + [[1]], # res4 temporal kernel. + [[1]], # res5 temporal kernel. + ], + "c2d": [ + [[1]], # conv1 temporal kernel. + [[1]], # res2 temporal kernel. + [[1]], # res3 temporal kernel. + [[1]], # res4 temporal kernel. + [[1]], # res5 temporal kernel. + ], + "slow_c2d": [ + [[1]], # conv1 temporal kernel. + [[1]], # res2 temporal kernel. + [[1]], # res3 temporal kernel. + [[1]], # res4 temporal kernel. + [[1]], # res5 temporal kernel. + ], + "i3d": [ + [[5]], # conv1 temporal kernel. + [[3]], # res2 temporal kernel. + [[3, 1]], # res3 temporal kernel. + [[3, 1]], # res4 temporal kernel. + [[1, 3]], # res5 temporal kernel. + ], + "slow_i3d": [ + [[5]], # conv1 temporal kernel. + [[3]], # res2 temporal kernel. + [[3, 1]], # res3 temporal kernel. + [[3, 1]], # res4 temporal kernel. + [[1, 3]], # res5 temporal kernel. + ], + "slow": [ + [[1]], # conv1 temporal kernel. + [[1]], # res2 temporal kernel. + [[1]], # res3 temporal kernel. + [[3]], # res4 temporal kernel. + [[3]], # res5 temporal kernel. + ], + "slowfast": [ + [[1], [5]], # conv1 temporal kernel for slow and fast pathway. + [[1], [3]], # res2 temporal kernel for slow and fast pathway. + [[1], [3]], # res3 temporal kernel for slow and fast pathway. + [[3], [3]], # res4 temporal kernel for slow and fast pathway. + [[3], [3]], # res5 temporal kernel for slow and fast pathway. + ], + "x3d": [ + [[5]], # conv1 temporal kernels. + [[3]], # res2 temporal kernels. + [[3]], # res3 temporal kernels. + [[3]], # res4 temporal kernels. + [[3]], # res5 temporal kernels. + ], +} + +# model stage depth + +_MODEL_STAGE_DEPTH = {18: (2, 2, 2, 2), 50: (3, 4, 6, 3), 101: (3, 4, 23, 3)} + +# X3D model + + +class X3D(nn.Module): + """ + X3D model builder. It builds a X3D network backbone, which is a ResNet. + + Christoph Feichtenhofer. + "X3D: Expanding Architectures for Efficient Video Recognition." + https://arxiv.org/abs/2004.04730 + """ + + def __init__(self, cfg): + """ + The `__init__` method of any subclass should also contain these + arguments. + + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. 
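+
+        Successive stage widths (dim_res2 through dim_res5) double via
+        round_width (rounded to multiples of 8); dim_res2 is only expanded
+        when X3D.SCALE_RES2 is set.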
+ """ + super(X3D, self).__init__() + self.norm_module = get_norm(cfg) + self.enable_detection = cfg.DETECTION.ENABLE + self.num_pathways = 1 + + exp_stage = 2.0 + self.dim_c1 = cfg.X3D.DIM_C1 + + self.dim_res2 = ( + round_width(self.dim_c1, exp_stage, divisor=8) + if cfg.X3D.SCALE_RES2 + else self.dim_c1 + ) + self.dim_res3 = round_width(self.dim_res2, exp_stage, divisor=8) + self.dim_res4 = round_width(self.dim_res3, exp_stage, divisor=8) + self.dim_res5 = round_width(self.dim_res4, exp_stage, divisor=8) + + self.block_basis = [ + # blocks, c, stride + [1, self.dim_res2, 2], + [2, self.dim_res3, 2], + [5, self.dim_res4, 2], + [3, self.dim_res5, 2], + ] + self._construct_network(cfg) + init_weights( + self, cfg.MODEL.FC_INIT_STD, cfg.RESNET.ZERO_INIT_FINAL_BN + ) + + def _round_repeats(self, repeats, multiplier): + """Round number of layers based on depth multiplier.""" + if not multiplier: + return repeats + return int(math.ceil(multiplier * repeats)) + + def _construct_network(self, cfg): + """ + Builds a single pathway X3D model. + + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + assert cfg.MODEL.ARCH in _POOL1.keys() + assert cfg.RESNET.DEPTH in _MODEL_STAGE_DEPTH.keys() + + (d2, d3, d4, d5) = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH] + + num_groups = cfg.RESNET.NUM_GROUPS + width_per_group = cfg.RESNET.WIDTH_PER_GROUP + dim_inner = num_groups * width_per_group + + w_mul = cfg.X3D.WIDTH_FACTOR + d_mul = cfg.X3D.DEPTH_FACTOR + dim_res1 = round_width(self.dim_c1, w_mul) + + temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH] + + self.s1 = VideoModelStem( + dim_in=cfg.DATA.INPUT_CHANNEL_NUM, + dim_out=[dim_res1], + kernel=[temp_kernel[0][0] + [3, 3]], + stride=[[1, 2, 2]], + padding=[[temp_kernel[0][0][0] // 2, 1, 1]], + norm_module=self.norm_module, + stem_func_name="x3d_stem", + ) + + # blob_in = s1 + dim_in = dim_res1 + for stage, block in enumerate(self.block_basis): + dim_out = round_width(block[1], w_mul) + dim_inner = int(cfg.X3D.BOTTLENECK_FACTOR * dim_out) + + n_rep = self._round_repeats(block[0], d_mul) + # start w res2 to follow convention + prefix = "s{}".format(stage + 2) + + s = ResStage( + dim_in=[dim_in], + dim_out=[dim_out], + dim_inner=[dim_inner], + temp_kernel_sizes=temp_kernel[1], + stride=[block[2]], + num_blocks=[n_rep], + num_groups=[dim_inner] if cfg.X3D.CHANNELWISE_3x3x3 else [ + num_groups], + num_block_temp_kernel=[n_rep], + nonlocal_inds=cfg.NONLOCAL.LOCATION[0], + nonlocal_group=cfg.NONLOCAL.GROUP[0], + nonlocal_pool=cfg.NONLOCAL.POOL[0], + instantiation=cfg.NONLOCAL.INSTANTIATION, + trans_func_name=cfg.RESNET.TRANS_FUNC, + stride_1x1=cfg.RESNET.STRIDE_1X1, + norm_module=self.norm_module, + dilation=cfg.RESNET.SPATIAL_DILATIONS[stage], + drop_connect_rate=cfg.MODEL.DROPCONNECT_RATE + * (stage + 2) + / (len(self.block_basis) + 1), + ) + dim_in = dim_out + self.add_module(prefix, s) + + if self.enable_detection: + NotImplementedError + else: + spat_sz = int(math.ceil(cfg.DATA.TRAIN_CROP_SIZE / 32.0)) + self.head = X3DHead( + dim_in=dim_out, + dim_inner=dim_inner, + dim_out=cfg.X3D.DIM_C5, + num_classes=cfg.MODEL.NUM_CLASSES, + pool_size=[cfg.DATA.NUM_FRAMES, spat_sz, spat_sz], + dropout_rate=cfg.MODEL.DROPOUT_RATE, + act_func=cfg.MODEL.HEAD_ACT, + bn_lin5_on=cfg.X3D.BN_LIN5, + ) + + def forward(self, x, bboxes=None): + for module in self.children(): + x = module(x) + return x + + +def build_model(cfg, gpu_id=None): + if torch.cuda.is_available(): + assert ( + cfg.NUM_GPUS <= torch.cuda.device_count() + ), 
"Cannot use more GPU devices than available" + else: + assert ( + cfg.NUM_GPUS == 0 + ), "Cuda is not available. Please set `NUM_GPUS: 0 for running on CPUs." + + # Construct the model + model = X3D(cfg) + + if cfg.BN.NORM_TYPE == "sync_batchnorm_apex": + try: + import apex + except ImportError: + raise ImportError( + "APEX is required for this model, pelase install") + + process_group = apex.parallel.create_syncbn_process_group( + group_size=cfg.BN.NUM_SYNC_DEVICES + ) + model = apex.parallel.convert_syncbn_model( + model, process_group=process_group) + + if cfg.NUM_GPUS: + if gpu_id is None: + # Determine the GPU used by the current process + cur_device = torch.cuda.current_device() + else: + cur_device = gpu_id + # Transfer the model to the current GPU device + model = model.cuda(device=cur_device) + # Use multi-process data parallel model in the multi-gpu setting + if cfg.NUM_GPUS > 1: + # Make model replica operate on the current device + model = torch.nn.parallel.DistributedDataParallel( + module=model, + device_ids=[cur_device], + output_device=cur_device, + find_unused_parameters=( + True + if cfg.MODEL.DETACH_FINAL_FC + or cfg.MODEL.MODEL_NAME == "ContrastiveModel" + else False + ), + ) + if cfg.MODEL.FP16_ALLREDUCE: + model.register_comm_hook( + state=None, hook=comm_hooks_default.fp16_compress_hook + ) + return model From ecbf84e32cfb5980f845d235876f1adad07bb467 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Mon, 25 Nov 2024 17:14:58 -0500 Subject: [PATCH 18/20] Add slowfast option --- src/kabr_tools/miniscene2behavior.py | 102 ++++++++++++++++----------- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 04bfabd..3ad253e 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -1,48 +1,15 @@ import sys import argparse +import random import torch from lxml import etree import numpy as np import pandas as pd import cv2 from tqdm import tqdm -import slowfast.utils.checkpoint as cu -from slowfast.models import build -from slowfast.utils import parser -from slowfast.datasets.utils import get_sequence -from slowfast.visualization.utils import process_cv2_inputs -from slowfast.datasets.cv2_transform import scale -from fvcore.common.config import CfgNode -from torch import Tensor - - -def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> list[Tensor]: - # https://github.com/facebookresearch/SlowFast/blob/bac7b672f40d44166a84e8c51d1a5ba367ace816/slowfast/visualization/ava_demo_precomputed_boxes.py - seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE - total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - assert keyframe_idx < total_frames, f"keyframe_idx: {keyframe_idx}" \ - f" >= total_frames: {total_frames}" - seq = get_sequence( - keyframe_idx, - seq_length // 2, - cfg.DATA.SAMPLING_RATE, - total_frames, - ) - - clip = [] - for frame_idx in seq: - cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) - was_read, frame = cap.read() - if was_read: - frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frame = scale(cfg.DATA.TEST_CROP_SIZE, frame) - clip.append(frame) - else: - print("Unable to read frame. 
Duplicating previous frame.") - clip.append(clip[-1]) - - clip = process_cv2_inputs(clip, cfg) - return clip +from kabr_tools.utils.slowfast.utils import get_input_clip +from kabr_tools.utils.slowfast.cfg import load_config, CfgNode +from kabr_tools.utils.slowfast.x3d import build_model def parse_args() -> argparse.Namespace: @@ -83,29 +50,43 @@ def parse_args() -> argparse.Namespace: help="filepath for output csv", default="annotation_data.csv" ) + local_parser.add_argument( + "--slowfast", + action="store_true", + help="load slowfast model" + ) return local_parser.parse_args() -def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[CfgNode, torch.nn.Module]: +def set_seeds(seed): + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + + +def create_slowfast(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[CfgNode, torch.nn.Module]: + import slowfast.utils.checkpoint as cu + from slowfast.models import build + from slowfast.utils import parser + # load model config try: cfg = parser.load_config(parser.parse_args(), config_path) except FileNotFoundError: checkpoint = torch.load( checkpoint_path, map_location=torch.device("cpu")) - with open(config_path, "w") as file: + with open(config_path, "w", encoding="utf-8") as file: file.write(checkpoint["cfg"]) cfg = parser.load_config(parser.parse_args(), config_path) cfg.NUM_GPUS = gpu_num cfg.OUTPUT_DIR = "" - model = build.build_model(cfg) # set random seeds - np.random.seed(cfg.RNG_SEED) - torch.manual_seed(cfg.RNG_SEED) + set_seeds(cfg.RNG_SEED) # load model checkpoint + model = build.build_model(cfg) cu.load_checkpoint(checkpoint_path, model, data_parallel=False) # set model to eval mode @@ -113,6 +94,33 @@ def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[ return cfg, model +def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[CfgNode, torch.nn.Module]: + # load model checkpoint + checkpoint = torch.load(checkpoint_path, weights_only=True, + map_location=torch.device("cpu")) + + # load model config + try: + cfg = load_config(config_path) + except FileNotFoundError: + with open(config_path, "w", encoding="utf-8") as file: + file.write(checkpoint["cfg"]) + cfg = load_config(config_path) + cfg.NUM_GPUS = gpu_num + cfg.OUTPUT_DIR = "" + + # set random seeds + set_seeds(cfg.RNG_SEED) + + # load model + model = build_model(cfg) + model.load_state_dict(checkpoint["model_state"]) + + # set model to eval mode + model.eval() + return cfg, model + + def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, miniscene_path: str, video: str, output_path: str) -> None: @@ -174,7 +182,15 @@ def main() -> None: # clear arguments to avoid slowfast parsing issues args = parse_args() sys.argv = [sys.argv[0]] - cfg, model = create_model(args.config, args.checkpoint, args.gpu_num) + + # load model + if not args.slowfast: + cfg, model = create_model(args.config, args.checkpoint, args.gpu_num) + else: + cfg, model = create_slowfast( + args.config, args.checkpoint, args.gpu_num) + + # annotate annotate_miniscene(cfg, model, args.miniscene, args.video, args.output) From b55b7adabc17a7330a692076a4441b1190c7f331 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Mon, 25 Nov 2024 17:20:17 -0500 Subject: [PATCH 19/20] Make detectron2 and slowfast optional --- pyproject.toml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6fb4d73..4d4a08a 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -35,9 +35,7 @@ dependencies = [ "ruamel.yaml", "ultralytics", "pandas", - "detectron2 @ git+https://github.com/facebookresearch/detectron2.git@2a420edb307c9bdf640f036d3b196bed474b8593", - "pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@1fadaef40dd393ca09680f55582399f4679fc9b7", - "slowfast @ git+https://github.com/Imageomics/SlowFast@797a6f3ae81c49019d006296f1e0f84f431dc356" + "pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@1fadaef40dd393ca09680f55582399f4679fc9b7" ] keywords = [ "annotation", @@ -68,3 +66,9 @@ miniscene2behavior = "kabr_tools.miniscene2behavior:main" [tool.hatch.version] path = "src/kabr_tools/__about__.py" + +[project.optional-dependencies] +slowfast = [ + "detectron2 @ git+https://github.com/facebookresearch/detectron2.git@2a420edb307c9bdf640f036d3b196bed474b8593", + "slowfast @ git+https://github.com/Imageomics/SlowFast@797a6f3ae81c49019d006296f1e0f84f431dc356" +] From 18129d2b90ce15c4ea15161cc7b2cef922a08a94 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:47:55 -0500 Subject: [PATCH 20/20] Update patch --- tests/test_miniscene2behavior.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_miniscene2behavior.py b/tests/test_miniscene2behavior.py index 7875e2d..2f650cf 100644 --- a/tests/test_miniscene2behavior.py +++ b/tests/test_miniscene2behavior.py @@ -94,9 +94,9 @@ def test_run(self): "--video", self.video] run() - @patch('kabr_tools.miniscene2behavior.process_cv2_inputs') + @patch('kabr_tools.miniscene2behavior.get_input_clip') @patch('kabr_tools.miniscene2behavior.cv2.VideoCapture') - def test_matching_tracks(self, video_capture, process_cv2_inputs): + def test_matching_tracks(self, video_capture, get_input_clip): # Create fake model that always returns a prediction of 1 mock_model = Mock() @@ -131,9 +131,9 @@ def test_matching_tracks(self, video_capture, process_cv2_inputs): "video", "track", "frame", "label"]) self.assertGreater(len(df.index), 0) - @patch('kabr_tools.miniscene2behavior.process_cv2_inputs') + @patch('kabr_tools.miniscene2behavior.get_input_clip') @patch('kabr_tools.miniscene2behavior.cv2.VideoCapture') - def test_nonmatching_tracks(self, video_capture, process_cv2_inputs): + def test_nonmatching_tracks(self, video_capture, get_input_clip): # Create fake model that always returns a prediction of 1 mock_model = Mock()