From 08b32428f39831a2fdfdfa1638e937f4f2568bd3 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:09:24 -0400 Subject: [PATCH 01/20] Type annotate miniscene2behavior --- src/kabr_tools/miniscene2behavior.py | 76 +++++++++++++++------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 022ffce..6b000a9 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -1,19 +1,21 @@ import sys +import argparse import torch from lxml import etree import pandas as pd import cv2 -import argparse from tqdm import tqdm import slowfast.utils.checkpoint as cu -import slowfast.models.build as build -import slowfast.utils.parser as parser +from slowfast.models import build +from slowfast.utils import parser from slowfast.datasets.utils import get_sequence from slowfast.visualization.utils import process_cv2_inputs from slowfast.datasets.cv2_transform import scale +from fvcore.common.config import CfgNode +from torch import Tensor -def get_input_clip(cap, cfg, keyframe_idx): +def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> list[Tensor]: # https://github.com/facebookresearch/SlowFast/blob/bac7b672f40d44166a84e8c51d1a5ba367ace816/slowfast/visualization/ava_demo_precomputed_boxes.py seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) @@ -32,7 +34,7 @@ def get_input_clip(cap, cfg, keyframe_idx): frame = scale(cfg.DATA.TEST_CROP_SIZE, frame) clip.append(frame) else: - print('Unable to read frame. Duplicating previous frame.') + print("Unable to read frame. Duplicating previous frame.") clip.append(clip[-1]) clip = process_cv2_inputs(clip, cfg) @@ -42,57 +44,57 @@ def get_input_clip(cap, cfg, keyframe_idx): def parse_args(): local_parser = argparse.ArgumentParser() local_parser.add_argument( - '--config', + "--config", type=str, - help='model config.yml filepath', - default='config.yml' + help="model config.yml filepath", + default="config.yml" ) local_parser.add_argument( - '--checkpoint', + "--checkpoint", type=str, - help='model checkpoint.pyth filepath', + help="model checkpoint.pyth filepath", required=True ) local_parser.add_argument( - '--gpu_num', + "--gpu_num", type=int, - help='number of gpus', + help="number of gpus", default=0 ) local_parser.add_argument( - '--miniscene', + "--miniscene", type=str, - help='miniscene folder containing miniscene\'s tracks.xml & *.mp4', + help="miniscene folder containing miniscene\'s tracks.xml & *.mp4", required=True ) local_parser.add_argument( - '--video', + "--video", type=str, - help='name of video (expect video_tracks.xml from tracks_extractor)', + help="name of video (expect video_tracks.xml from tracks_extractor)", required=True ) local_parser.add_argument( - '--output', + "--output", type=str, - help='filepath for output csv', - default='annotation_data.csv' + help="filepath for output csv", + default="annotation_data.csv" ) return local_parser.parse_args() -def create_model(config_path, checkpoint_path, gpu_num): +def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[CfgNode, torch.nn.Module]: # load model config try: cfg = parser.load_config(parser.parse_args(), config_path) except FileNotFoundError: checkpoint = torch.load( - checkpoint_path, map_location=torch.device('cpu')) - with open(config_path, 'w') as file: - file.write(checkpoint['cfg']) + 
checkpoint_path, map_location=torch.device("cpu")) + with open(config_path, "w") as file: + file.write(checkpoint["cfg"]) cfg = parser.load_config(parser.parse_args(), config_path) cfg.NUM_GPUS = gpu_num - cfg.OUTPUT_DIR = '' + cfg.OUTPUT_DIR = "" model = build.build_model(cfg) # load model checkpoint @@ -103,9 +105,9 @@ def create_model(config_path, checkpoint_path, gpu_num): return cfg, model -def annotate_miniscene(cfg, model, miniscene_path, video, output_path): +def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, miniscene_path: str, video: str, output_path: str) -> None: label_data = [] - track_file = f'{miniscene_path}/metadata/{video}_tracks.xml' + track_file = f"{miniscene_path}/metadata/{video}_tracks.xml" root = etree.parse(track_file).getroot() # find all tracks @@ -115,15 +117,17 @@ def annotate_miniscene(cfg, model, miniscene_path, video, output_path): tracks.append(track_id) # find all frames + assert len(tracks) > 0, "No tracks found in track file" + track = tracks[-1] frames = [] for box in track.iterfind("box"): - frames.append(int(box.attrib['frame'])) + frames.append(int(box.attrib["frame"])) # run model on miniscene for track in tracks: video_file = f"{miniscene_path}/{track}.mp4" cap = cv2.VideoCapture(video_file) - for frame in tqdm(frames, desc=f'{track} frames'): + for frame in tqdm(frames, desc=f"{track} frames"): inputs = get_input_clip(cap, cfg, frame) if cfg.NUM_GPUS: @@ -140,17 +144,17 @@ def annotate_miniscene(cfg, model, miniscene_path, video, output_path): if cfg.NUM_GPUS: preds = preds.cpu() - label_data.append({'video': video, - 'track': track, - 'frame': frame, - 'label': torch.argmax(preds).item()}) + label_data.append({"video": video, + "track": track, + "frame": frame, + "label": torch.argmax(preds).item()}) if frame % 20 == 0: pd.DataFrame(label_data).to_csv( - output_path, sep=' ', index=False) - pd.DataFrame(label_data).to_csv(output_path, sep=' ', index=False) + output_path, sep=" ", index=False) + pd.DataFrame(label_data).to_csv(output_path, sep=" ", index=False) -def main(): +def main() -> None: # clear arguments to avoid slowfast parsing issues args = parse_args() sys.argv = [sys.argv[0]] @@ -159,5 +163,5 @@ def main(): args.video, args.output) -if __name__ == '__main__': +if __name__ == "__main__": main() From d747a3ee1b77d5563120e402d636d65ba188f610 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:10:29 -0400 Subject: [PATCH 02/20] Type annotate parse_args --- src/kabr_tools/miniscene2behavior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 6b000a9..5a1a87e 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -41,7 +41,7 @@ def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> li return clip -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( "--config", From dc9d6ae396ec168baee716cce6c11eb125ccfb84 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:19:49 -0400 Subject: [PATCH 03/20] Annotate tracks extractor --- src/kabr_tools/tracks_extractor.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/kabr_tools/tracks_extractor.py b/src/kabr_tools/tracks_extractor.py index 4beff99..6fbab25 100644 --- 
a/src/kabr_tools/tracks_extractor.py +++ b/src/kabr_tools/tracks_extractor.py @@ -15,7 +15,7 @@ from tqdm import tqdm -def generate_timeline_image(name, folder, timeline, annotated_size): +def generate_timeline_image(name: str, folder: str, timeline: OrderedDict, annotated_size: int) -> None: timeline_image = np.zeros(shape=(len(timeline["tracks"].keys()) * 100, annotated_size, 3), dtype=np.uint8) for i, (key, value) in enumerate(timeline["tracks"].items()): @@ -47,7 +47,7 @@ def generate_timeline_image(name, folder, timeline, annotated_size): cv2.imwrite(f"mini-scenes/{folder}/metadata/{name}.jpg", timeline_resized) -def extract(video_path, annotation_path, tracking, show): +def extract(video_path: str, annotation_path: str, tracking: bool, show: bool) -> None: # Parse CVAT for video 1.1 annotation file. root = etree.parse(annotation_path).getroot() annotated = dict() @@ -180,7 +180,7 @@ def extract(video_path, annotation_path, tracking, show): vw.release() cv2.destroyAllWindows() -def tracks_extractor(video, annotation, tracking, show): +def tracks_extractor(video: str, annotation: str, tracking: bool, show: bool) -> None: if os.path.isdir(annotation): videos = [] annotations = [] @@ -208,34 +208,34 @@ def tracks_extractor(video, annotation, tracking, show): extract(video, annotation, tracking, show) -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( - '--video', + "--video", type=str, - help='path to folder containing videos', + help="path to folder containing videos", required=True ) local_parser.add_argument( - '--annotation', + "--annotation", type=str, - help='path to folder containing annotations', + help="path to folder containing annotations", required=True ) local_parser.add_argument( - '--tracking', - action='store_true', - help='Flag to use external tracker instead of CVAT tracks' + "--tracking", + action="store_true", + help="Flag to use external tracker instead of CVAT tracks" ) local_parser.add_argument( - '--imshow', - action='store_true', - help='Flag to display tracks\' visualization' + "--imshow", + action="store_true", + help="Flag to display tracks\' visualization" ) return local_parser.parse_args() -def main(): +def main() -> None: args = parse_args() tracks_extractor(args.video, args.annotation, args.tracking, args.imshow) From 952cc404469bae94b03fe8930c919464f78349cf Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:31:17 -0400 Subject: [PATCH 04/20] Fix miniscene2behavior --- src/kabr_tools/miniscene2behavior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 5a1a87e..d9c93dc 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -117,8 +117,8 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, miniscene_path: str tracks.append(track_id) # find all frames + # TODO: rewrite - some tracks may have different frames assert len(tracks) > 0, "No tracks found in track file" - track = tracks[-1] frames = [] for box in track.iterfind("box"): frames.append(int(box.attrib["frame"])) From b78b85f8cd3f9b4d95e79d3ca200b80f82b8dfa9 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:33:54 -0400 Subject: [PATCH 05/20] Type annotate cvat2slowfast --- src/kabr_tools/cvat2slowfast.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 
deletions(-) diff --git a/src/kabr_tools/cvat2slowfast.py b/src/kabr_tools/cvat2slowfast.py index 54cd4de..6e689ae 100644 --- a/src/kabr_tools/cvat2slowfast.py +++ b/src/kabr_tools/cvat2slowfast.py @@ -1,5 +1,6 @@ import os import sys +from typing import Optional import argparse import json from lxml import etree @@ -9,9 +10,7 @@ import cv2 -def cvat2slowfast(path_to_mini_scenes, path_to_new_dataset, label2number, old2new): - number2label = {value: key for key, value in label2number.items()} - +def cvat2slowfast(path_to_mini_scenes: str, path_to_new_dataset: str, label2number: dict, old2new: Optional[dict]) -> None: if not os.path.exists(path_to_new_dataset): os.makedirs(path_to_new_dataset) @@ -143,7 +142,7 @@ def cvat2slowfast(path_to_mini_scenes, path_to_new_dataset, label2number, old2ne f"{path_to_new_dataset}/annotation/data.csv", sep=" ", index=False) -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( '--miniscene', @@ -172,7 +171,7 @@ def parse_args(): return local_parser.parse_args() -def main(): +def main() -> None: args = parse_args() with open(args.classes, mode='r', encoding='utf-8') as file: From 15058418ede67e3f21bbc9f611afcb9d313124ba Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:39:43 -0400 Subject: [PATCH 06/20] Type annotate cvat2ultralytics --- src/kabr_tools/cvat2ultralytics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/kabr_tools/cvat2ultralytics.py b/src/kabr_tools/cvat2ultralytics.py index bc0dd4c..5bb9bf0 100644 --- a/src/kabr_tools/cvat2ultralytics.py +++ b/src/kabr_tools/cvat2ultralytics.py @@ -1,4 +1,5 @@ import os +from typing import Optional import argparse import json import cv2 @@ -10,7 +11,7 @@ from natsort import natsorted -def cvat2ultralytics(video_path, annotation_path, dataset, skip, label2index=None): +def cvat2ultralytics(video_path: str, annotation_path: str, dataset: str, skip: int, label2index: Optional[dict] = None): # Create a YOLO dataset structure. 
dataset_file = f""" path: {dataset} From 5ca8c98457256e2b12d773afd12b8b11e8b739fb Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:45:15 -0400 Subject: [PATCH 07/20] Type annotate detector2cvat --- src/kabr_tools/cvat2slowfast.py | 3 ++- src/kabr_tools/cvat2ultralytics.py | 8 +++++--- src/kabr_tools/detector2cvat.py | 15 +++++++-------- src/kabr_tools/miniscene2behavior.py | 4 +++- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/kabr_tools/cvat2slowfast.py b/src/kabr_tools/cvat2slowfast.py index 6e689ae..fbdcd92 100644 --- a/src/kabr_tools/cvat2slowfast.py +++ b/src/kabr_tools/cvat2slowfast.py @@ -10,7 +10,8 @@ import cv2 -def cvat2slowfast(path_to_mini_scenes: str, path_to_new_dataset: str, label2number: dict, old2new: Optional[dict]) -> None: +def cvat2slowfast(path_to_mini_scenes: str, path_to_new_dataset: str, + label2number: dict, old2new: Optional[dict]) -> None: if not os.path.exists(path_to_new_dataset): os.makedirs(path_to_new_dataset) diff --git a/src/kabr_tools/cvat2ultralytics.py b/src/kabr_tools/cvat2ultralytics.py index 5bb9bf0..5b757c8 100644 --- a/src/kabr_tools/cvat2ultralytics.py +++ b/src/kabr_tools/cvat2ultralytics.py @@ -11,7 +11,9 @@ from natsort import natsorted -def cvat2ultralytics(video_path: str, annotation_path: str, dataset: str, skip: int, label2index: Optional[dict] = None): +def cvat2ultralytics(video_path: str, annotation_path: str, + dataset: str, skip: int, + label2index: Optional[dict] = None) -> None: # Create a YOLO dataset structure. dataset_file = f""" path: {dataset} @@ -170,7 +172,7 @@ def cvat2ultralytics(video_path: str, annotation_path: str, dataset: str, skip: shutil.move(f"{dataset}/labels/train/{file}", f"{dataset}/labels/test/{file}") -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( '--video', @@ -205,7 +207,7 @@ def parse_args(): return local_parser.parse_args() -def main(): +def main() -> None: args = parse_args() if args.label2index: diff --git a/src/kabr_tools/detector2cvat.py b/src/kabr_tools/detector2cvat.py index 6a4c5b8..58b2f84 100644 --- a/src/kabr_tools/detector2cvat.py +++ b/src/kabr_tools/detector2cvat.py @@ -8,8 +8,7 @@ from kabr_tools.utils.draw import Draw - -def detector2cvat(path_to_videos, path_to_save): +def detector2cvat(path_to_videos: str, path_to_save: str) -> None: videos = [] for root, dirs, files in os.walk(path_to_videos): @@ -97,24 +96,24 @@ def detector2cvat(path_to_videos, path_to_save): print("Something went wrong...") -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( - '--video', + "--video", type=str, - help='path to folder containing videos', + help="path to folder containing videos", required=True ) local_parser.add_argument( - '--save', + "--save", type=str, - help='path to save output xml & mp4 files', + help="path to save output xml & mp4 files", required=True ) return local_parser.parse_args() -def main(): +def main() -> None: args = parse_args() detector2cvat(args.video, args.save) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index d9c93dc..ad7b6ed 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -105,7 +105,9 @@ def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[ return cfg, model -def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, 
miniscene_path: str, video: str, output_path: str) -> None: +def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, + miniscene_path: str, video: str, + output_path: str) -> None: label_data = [] track_file = f"{miniscene_path}/metadata/{video}_tracks.xml" root = etree.parse(track_file).getroot() From a90608b61f7101ce94b373d5645fd0d64db27431 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:59:05 -0400 Subject: [PATCH 08/20] Type annotate player --- src/kabr_tools/player.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/kabr_tools/player.py b/src/kabr_tools/player.py index d440085..28cb460 100644 --- a/src/kabr_tools/player.py +++ b/src/kabr_tools/player.py @@ -1,12 +1,13 @@ import os import argparse import json -from lxml import etree from collections import OrderedDict +from lxml import etree import cv2 +from cv2.typing import MatLike -def on_slider_change(value): +def on_slider_change(value: int) -> None: global index, vcs, current, trackbar_position, paused, updated index = value @@ -17,7 +18,7 @@ def on_slider_change(value): updated = True -def pad(image, width, height): +def pad(image: MatLike, width: int, height: int) -> MatLike: shape_0, shape_1 = image.shape[0], image.shape[1] if shape_0 < shape_1: @@ -34,7 +35,7 @@ def pad(image, width, height): return padded -def draw_aim(current, image): +def draw_aim(current: str, image: MatLike) -> MatLike: if current == "main": return image @@ -47,7 +48,8 @@ def draw_aim(current, image): return cv2.addWeighted(image, 0.4, copied, 0.6, 0.0) -def draw_id(current, image, metadata, width): +def draw_id(current: str, image: MatLike, + metadata: dict, width: int) -> MatLike: if current == "main": label = f"Drone View" color = (127, 127, 127) @@ -68,7 +70,9 @@ def draw_id(current, image, metadata, width): return cv2.addWeighted(image, 0.4, copied, 0.6, 0.0) -def draw_actions(current, index, image, actions, metadata, width, height): +def draw_actions(current: str, index: int, + image: MatLike, actions: OrderedDict, + metadata: dict, width: int, height: int) -> MatLike: if current == "main": return image @@ -92,7 +96,7 @@ def draw_actions(current, index, image, actions, metadata, width, height): return cv2.addWeighted(image, 0.4, copied, 0.6, 0.0) -def draw_info(image, width): +def draw_info(image: MatLike, width: int) -> MatLike: copied = image.copy() cv2.rectangle(image, (width - 600, 100), (width - 100, 340), (0, 0, 0), -1) cv2.putText(image, "[0-9]: Show Track #[0-9]", (width - 565, 150), @@ -107,7 +111,7 @@ def draw_info(image, width): return cv2.addWeighted(image, 0.4, copied, 0.6, 0.0) -def hotkey(key): +def hotkey(key: int) -> None: global current, metadata, vc, letter2hotkey mapped = letter2hotkey[key] @@ -130,7 +134,7 @@ def hotkey(key): vc.set(cv2.CAP_PROP_POS_FRAMES, metadata["tracks"][current][index]) -def player(folder, save): +def player(folder: str, save: bool) -> None: name = folder.split("/")[-1].split('|')[-1] metadata_path = f"{folder}/metadata/{name}_metadata.json" @@ -269,7 +273,7 @@ def player(folder, save): cv2.destroyAllWindows() -def parse_args(): +def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( '--folder', @@ -285,7 +289,7 @@ def parse_args(): return local_parser.parse_args() -def main(): +def main() -> None: args = parse_args() player(args.folder, args.save) From e53c6d8bf52822b7ef971e85ce9ac734357de684 Mon Sep 17 00:00:00 2001 From: zhong-al 
<74470739+zhong-al@users.noreply.github.com> Date: Tue, 15 Oct 2024 22:14:49 -0400 Subject: [PATCH 09/20] Make imshow optional, update docs --- README.md | 2 +- src/kabr_tools/detector2cvat.py | 13 ++++++++++--- src/kabr_tools/player.py | 27 +++++++++++++++++---------- src/kabr_tools/tracks_extractor.py | 4 ++-- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 367f4be..559d937 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ You may use [YOLO](https://docs.ultralytics.com/) to automatically perform detec Detect objects with Ultralytics YOLO detections, apply SORT tracking and convert tracks to CVAT format. ``` -detector2cvat --video path_to_videos --save path_to_save +detector2cvat --video path_to_videos --save path_to_save [--imshow] ``` diff --git a/src/kabr_tools/detector2cvat.py b/src/kabr_tools/detector2cvat.py index 58b2f84..52df627 100644 --- a/src/kabr_tools/detector2cvat.py +++ b/src/kabr_tools/detector2cvat.py @@ -8,7 +8,7 @@ from kabr_tools.utils.draw import Draw -def detector2cvat(path_to_videos: str, path_to_save: str) -> None: +def detector2cvat(path_to_videos: str, path_to_save: str, show: bool) -> None: videos = [] for root, dirs, files in os.walk(path_to_videos): @@ -76,7 +76,9 @@ def detector2cvat(path_to_videos: str, path_to_save: str) -> None: cv2.putText(visualization, f"Frame: {index}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 3, cv2.LINE_AA) - cv2.imshow("detector2cvat", cv2.resize(visualization, (int(width // 2.5), int(height // 2.5)))) + if show: + cv2.imshow("detector2cvat", cv2.resize( + visualization, (int(width // 2.5), int(height // 2.5)))) vw.write(visualization) key = cv2.waitKey(1) index += 1 @@ -110,12 +112,17 @@ def parse_args() -> argparse.Namespace: help="path to save output xml & mp4 files", required=True ) + local_parser.add_argument( + "--imshow", + action="store_true", + help="flag to display detector's visualization" + ) return local_parser.parse_args() def main() -> None: args = parse_args() - detector2cvat(args.video, args.save) + detector2cvat(args.video, args.save, args.imshow) if __name__ == "__main__": diff --git a/src/kabr_tools/player.py b/src/kabr_tools/player.py index 28cb460..6c4a83f 100644 --- a/src/kabr_tools/player.py +++ b/src/kabr_tools/player.py @@ -134,7 +134,7 @@ def hotkey(key: int) -> None: vc.set(cv2.CAP_PROP_POS_FRAMES, metadata["tracks"][current][index]) -def player(folder: str, save: bool) -> None: +def player(folder: str, save: bool, show: bool) -> None: name = folder.split("/")[-1].split('|')[-1] metadata_path = f"{folder}/metadata/{name}_metadata.json" @@ -216,9 +216,11 @@ def player(folder: str, save: bool) -> None: cv2.setTrackbarPos(name, "TrackPlayer", index) cv2.putText(visualization, f"Frame: {index}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 3, cv2.LINE_AA) - - cv2.imshow("TrackPlayer", cv2.resize(visualization, (int(target_width // 2.5), int(target_height // 2.5)), - interpolation=cv2.INTER_AREA)) + if show: + cv2.imshow("TrackPlayer", + cv2.resize(visualization, + (int(target_width // 2.5), int(target_height // 2.5)), + interpolation=cv2.INTER_AREA)) if save: vw.write(visualization) @@ -276,22 +278,27 @@ def player(folder: str, save: bool) -> None: def parse_args() -> argparse.Namespace: local_parser = argparse.ArgumentParser() local_parser.add_argument( - '--folder', + "--folder", type=str, - help='path to folder with metadata and actions', + help="path to folder with metadata and actions", required=True ) 
local_parser.add_argument( - '--save', - action='store_true', - help='Flag to save video' + "--save", + action="store_true", + help="flag to save video" + ) + local_parser.add_argument( + "--imshow", + action="store_true", + help="flag to display detector's visualization" ) return local_parser.parse_args() def main() -> None: args = parse_args() - player(args.folder, args.save) + player(args.folder, args.save, args.imshow) if __name__ == "__main__": diff --git a/src/kabr_tools/tracks_extractor.py b/src/kabr_tools/tracks_extractor.py index 6fbab25..112ac8e 100644 --- a/src/kabr_tools/tracks_extractor.py +++ b/src/kabr_tools/tracks_extractor.py @@ -225,12 +225,12 @@ def parse_args() -> argparse.Namespace: local_parser.add_argument( "--tracking", action="store_true", - help="Flag to use external tracker instead of CVAT tracks" + help="flag to use external tracker instead of CVAT tracks" ) local_parser.add_argument( "--imshow", action="store_true", - help="Flag to display tracks\' visualization" + help="flag to display tracks\' visualization" ) return local_parser.parse_args() From 9405859b1db55d03806714b7dbafc587d84fead2 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Wed, 16 Oct 2024 11:08:02 -0400 Subject: [PATCH 10/20] Find frames per track --- src/kabr_tools/miniscene2behavior.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index ad7b6ed..ed3f090 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -114,22 +114,21 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, # find all tracks tracks = [] + frames = {} for track in root.iterfind("track"): track_id = track.attrib["id"] tracks.append(track_id) + frames[track_id] = [] - # find all frames - # TODO: rewrite - some tracks may have different frames - assert len(tracks) > 0, "No tracks found in track file" - frames = [] - for box in track.iterfind("box"): - frames.append(int(box.attrib["frame"])) + # find all frames + for box in track.iterfind("box"): + frames[track_id].append(int(box.attrib["frame"])) # run model on miniscene for track in tracks: video_file = f"{miniscene_path}/{track}.mp4" cap = cv2.VideoCapture(video_file) - for frame in tqdm(frames, desc=f"{track} frames"): + for frame in tqdm(frames[track], desc=f"{track} frames"): inputs = get_input_clip(cap, cfg, frame) if cfg.NUM_GPUS: From d61b06abd33e5639aaf3ce0b71915fbaa501a240 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Wed, 16 Oct 2024 16:56:30 -0400 Subject: [PATCH 11/20] Account for track extraction --- src/kabr_tools/miniscene2behavior.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index ed3f090..0ebb036 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -128,8 +128,9 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, for track in tracks: video_file = f"{miniscene_path}/{track}.mp4" cap = cv2.VideoCapture(video_file) + start_frame = frames[track][0] for frame in tqdm(frames[track], desc=f"{track} frames"): - inputs = get_input_clip(cap, cfg, frame) + inputs = get_input_clip(cap, cfg, frame - start_frame) if cfg.NUM_GPUS: # transfer the data to the current GPU device. 
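
The two patches above rework annotate_miniscene so that annotated frames are collected per track and the keyframe index passed to the model is taken relative to the track's first annotated frame, since each mini-scene .mp4 contains only that track's frames. A minimal sketch of that bookkeeping follows; the XML content and frame numbers are hypothetical stand-ins for a real *_tracks.xml file, and it assumes each mini-scene video starts at the track's first annotated frame:

from lxml import etree

# Hypothetical, simplified stand-in for a mini-scene's metadata/<video>_tracks.xml.
xml = b"""
<annotations>
  <track id="0"><box frame="120"/><box frame="121"/><box frame="122"/></track>
  <track id="1"><box frame="300"/><box frame="301"/></track>
</annotations>
"""
root = etree.fromstring(xml)

# Collect annotated frame numbers per track, as the patch does.
frames = {}
for track in root.iterfind("track"):
    track_id = track.attrib["id"]
    frames[track_id] = [int(box.attrib["frame"]) for box in track.iterfind("box")]

# The keyframe handed to the model is offset by the track's first frame,
# mapping source-video frame numbers onto mini-scene frame indices.
for track_id, track_frames in frames.items():
    start_frame = track_frames[0]
    for frame in track_frames:
        print(track_id, frame, "->", frame - start_frame)
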
From a144cb1da96971be0a2e2c68d6443f32bc9cf233 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Thu, 17 Oct 2024 17:52:01 -0400 Subject: [PATCH 12/20] Add check to miniscene2behavior --- src/kabr_tools/miniscene2behavior.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 0ebb036..c012a4f 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -19,12 +19,16 @@ def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> li # https://github.com/facebookresearch/SlowFast/blob/bac7b672f40d44166a84e8c51d1a5ba367ace816/slowfast/visualization/ava_demo_precomputed_boxes.py seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + assert keyframe_idx < total_frames, f"keyframe_idx: {keyframe_idx}" \ + f">= total_frames: {total_frames}" seq = get_sequence( keyframe_idx, seq_length // 2, cfg.DATA.SAMPLING_RATE, total_frames, ) + # TODO: remove after debugging + print(seq) clip = [] for frame_idx in seq: cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) @@ -130,7 +134,11 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, cap = cv2.VideoCapture(video_file) start_frame = frames[track][0] for frame in tqdm(frames[track], desc=f"{track} frames"): - inputs = get_input_clip(cap, cfg, frame - start_frame) + try: + inputs = get_input_clip(cap, cfg, frame - start_frame) + except AssertionError as e: + print(e) + break if cfg.NUM_GPUS: # transfer the data to the current GPU device. @@ -153,6 +161,7 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, if frame % 20 == 0: pd.DataFrame(label_data).to_csv( output_path, sep=" ", index=False) + cap.release() pd.DataFrame(label_data).to_csv(output_path, sep=" ", index=False) From 2787bad23dc803246446129c9f4009b37d68a4e5 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Thu, 17 Oct 2024 19:30:07 -0400 Subject: [PATCH 13/20] Print more debug --- src/kabr_tools/miniscene2behavior.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index c012a4f..6a0db17 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -28,7 +28,7 @@ def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> li total_frames, ) # TODO: remove after debugging - print(seq) + print(keyframe_idx, seq[0], seq[-1], total_frames) clip = [] for frame_idx in seq: cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) @@ -143,8 +143,8 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, if cfg.NUM_GPUS: # transfer the data to the current GPU device. 
if isinstance(inputs, (list,)): - for i in range(len(inputs)): - inputs[i] = inputs[i].cuda(non_blocking=True) + for i, input_clip in enumerate(inputs): + inputs[i] = input_clip.cuda(non_blocking=True) else: inputs = inputs.cuda(non_blocking=True) From 488e0fe4f7350e3a3ffb9888a959b13e51c4ebce Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Thu, 17 Oct 2024 20:24:33 -0400 Subject: [PATCH 14/20] Use index because track frames can be noncontiguous --- src/kabr_tools/miniscene2behavior.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 6a0db17..430ecdd 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -132,10 +132,10 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, for track in tracks: video_file = f"{miniscene_path}/{track}.mp4" cap = cv2.VideoCapture(video_file) - start_frame = frames[track][0] - for frame in tqdm(frames[track], desc=f"{track} frames"): + print(f'{track=}') + for index, frame in tqdm(enumerate(frames[track]), desc=f'{track} frames'): try: - inputs = get_input_clip(cap, cfg, frame - start_frame) + inputs = get_input_clip(cap, cfg, index) except AssertionError as e: print(e) break From da5b0b8e463646b8dd7009d1058fda186885d77e Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:16:26 -0400 Subject: [PATCH 15/20] Fix spacing + tqdm bar --- src/kabr_tools/miniscene2behavior.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 430ecdd..61630df 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -20,15 +20,14 @@ def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> li seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) assert keyframe_idx < total_frames, f"keyframe_idx: {keyframe_idx}" \ - f">= total_frames: {total_frames}" + f" >= total_frames: {total_frames}" seq = get_sequence( keyframe_idx, seq_length // 2, cfg.DATA.SAMPLING_RATE, total_frames, ) - # TODO: remove after debugging - print(keyframe_idx, seq[0], seq[-1], total_frames) + clip = [] for frame_idx in seq: cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) @@ -132,13 +131,14 @@ def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, for track in tracks: video_file = f"{miniscene_path}/{track}.mp4" cap = cv2.VideoCapture(video_file) - print(f'{track=}') - for index, frame in tqdm(enumerate(frames[track]), desc=f'{track} frames'): + index = 0 + for frame in tqdm(frames[track], desc=f'{track} frames'): try: inputs = get_input_clip(cap, cfg, index) except AssertionError as e: print(e) break + index += 1 if cfg.NUM_GPUS: # transfer the data to the current GPU device. 
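
Patches 14 and 15 above replace the frame - start_frame offset with a running counter when reading clips from the mini-scene video. As the commit message notes, a track's annotated frames can be noncontiguous, so the offset from the first frame can point past the end of the extracted clip, while a sequential index matches the frames that were actually written to the mini-scene .mp4. A small illustrative sketch with hypothetical frame numbers:

# Hypothetical track with a gap in its annotated frames.
track_frames = [120, 121, 125, 126]

start_frame = track_frames[0]
for index, frame in enumerate(track_frames):
    offset = frame - start_frame  # 0, 1, 5, 6 -> overshoots the 4-frame mini-scene
    print(frame, offset, index)   # index 0, 1, 2, 3 matches the mini-scene video
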
From 49aaa4b7908bfb6bf0f9f4c765d7147dde515702 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:17:55 -0400 Subject: [PATCH 16/20] Set random seeds --- src/kabr_tools/miniscene2behavior.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 61630df..04bfabd 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -2,6 +2,7 @@ import argparse import torch from lxml import etree +import numpy as np import pandas as pd import cv2 from tqdm import tqdm @@ -100,6 +101,10 @@ def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[ cfg.OUTPUT_DIR = "" model = build.build_model(cfg) + # set random seeds + np.random.seed(cfg.RNG_SEED) + torch.manual_seed(cfg.RNG_SEED) + # load model checkpoint cu.load_checkpoint(checkpoint_path, model, data_parallel=False) From e77a5d15bdc9573cbff178bdbe2d41f3ab08d2a9 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Fri, 22 Nov 2024 20:57:23 -0500 Subject: [PATCH 17/20] Add slowfast code --- src/kabr_tools/utils/slowfast/LICENSE | 203 ++++ src/kabr_tools/utils/slowfast/__init__.py | 0 src/kabr_tools/utils/slowfast/cfg.py | 1295 +++++++++++++++++++++ src/kabr_tools/utils/slowfast/head.py | 145 +++ src/kabr_tools/utils/slowfast/norm.py | 109 ++ src/kabr_tools/utils/slowfast/resnet.py | 926 +++++++++++++++ src/kabr_tools/utils/slowfast/stem.py | 321 +++++ src/kabr_tools/utils/slowfast/utils.py | 115 ++ src/kabr_tools/utils/slowfast/x3d.py | 352 ++++++ 9 files changed, 3466 insertions(+) create mode 100644 src/kabr_tools/utils/slowfast/LICENSE create mode 100644 src/kabr_tools/utils/slowfast/__init__.py create mode 100644 src/kabr_tools/utils/slowfast/cfg.py create mode 100644 src/kabr_tools/utils/slowfast/head.py create mode 100644 src/kabr_tools/utils/slowfast/norm.py create mode 100644 src/kabr_tools/utils/slowfast/resnet.py create mode 100644 src/kabr_tools/utils/slowfast/stem.py create mode 100644 src/kabr_tools/utils/slowfast/utils.py create mode 100644 src/kabr_tools/utils/slowfast/x3d.py diff --git a/src/kabr_tools/utils/slowfast/LICENSE b/src/kabr_tools/utils/slowfast/LICENSE new file mode 100644 index 0000000..32e386e --- /dev/null +++ b/src/kabr_tools/utils/slowfast/LICENSE @@ -0,0 +1,203 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. 
+ +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. 
+ +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright 2019, Facebook, Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +https://github.com/facebookresearch/SlowFast/blob/main/LICENSE \ No newline at end of file diff --git a/src/kabr_tools/utils/slowfast/__init__.py b/src/kabr_tools/utils/slowfast/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/kabr_tools/utils/slowfast/cfg.py b/src/kabr_tools/utils/slowfast/cfg.py new file mode 100644 index 0000000..6ef6c20 --- /dev/null +++ b/src/kabr_tools/utils/slowfast/cfg.py @@ -0,0 +1,1295 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+ +"""Configs.""" +import math + +from fvcore.common.config import CfgNode + +# ----------------------------------------------------------------------------- +# Config definition +# ----------------------------------------------------------------------------- +_C = CfgNode() + +# ----------------------------------------------------------------------------- +# Contrastive Model (for MoCo, SimCLR, SwAV, BYOL) +# ----------------------------------------------------------------------------- + +_C.CONTRASTIVE = CfgNode() + +# temperature used for contrastive losses +_C.CONTRASTIVE.T = 0.07 + +# output dimension for the loss +_C.CONTRASTIVE.DIM = 128 + +# number of training samples (for kNN bank) +_C.CONTRASTIVE.LENGTH = 239975 + +# the length of MoCo's and MemBanks' queues +_C.CONTRASTIVE.QUEUE_LEN = 65536 + +# momentum for momentum encoder updates +_C.CONTRASTIVE.MOMENTUM = 0.5 + +# wether to anneal momentum to value above with cosine schedule +_C.CONTRASTIVE.MOMENTUM_ANNEALING = False + +# either memorybank, moco, simclr, byol, swav +_C.CONTRASTIVE.TYPE = "mem" + +# wether to interpolate memorybank in time +_C.CONTRASTIVE.INTERP_MEMORY = False + +# 1d or 2d (+temporal) memory +_C.CONTRASTIVE.MEM_TYPE = "1d" + +# number of classes for online kNN evaluation +_C.CONTRASTIVE.NUM_CLASSES_DOWNSTREAM = 400 + +# use an MLP projection with these num layers +_C.CONTRASTIVE.NUM_MLP_LAYERS = 1 + +# dimension of projection and predictor MLPs +_C.CONTRASTIVE.MLP_DIM = 2048 + +# use BN in projection/prediction MLP +_C.CONTRASTIVE.BN_MLP = False + +# use synchronized BN in projection/prediction MLP +_C.CONTRASTIVE.BN_SYNC_MLP = False + +# shuffle BN only locally vs. across machines +_C.CONTRASTIVE.LOCAL_SHUFFLE_BN = True + +# Wether to fill multiple clips (or just the first) into queue +_C.CONTRASTIVE.MOCO_MULTI_VIEW_QUEUE = False + +# if sampling multiple clips per vid they need to be at least min frames apart +_C.CONTRASTIVE.DELTA_CLIPS_MIN = -math.inf + +# if sampling multiple clips per vid they can be max frames apart +_C.CONTRASTIVE.DELTA_CLIPS_MAX = math.inf + +# if non empty, use predictors with depth specified +_C.CONTRASTIVE.PREDICTOR_DEPTHS = [] + +# Wether to sequentially process multiple clips (=lower mem usage) or batch them +_C.CONTRASTIVE.SEQUENTIAL = False + +# Wether to perform SimCLR loss across machines (or only locally) +_C.CONTRASTIVE.SIMCLR_DIST_ON = True + +# Length of queue used in SwAV +_C.CONTRASTIVE.SWAV_QEUE_LEN = 0 + +# Wether to run online kNN evaluation during training +_C.CONTRASTIVE.KNN_ON = True + + +# ---------------------------------------------------------------------------- # +# Batch norm options +# ---------------------------------------------------------------------------- # +_C.BN = CfgNode() + +# Precise BN stats. +_C.BN.USE_PRECISE_STATS = False + +# Number of samples use to compute precise bn. +_C.BN.NUM_BATCHES_PRECISE = 200 + +# Weight decay value that applies on BN. +_C.BN.WEIGHT_DECAY = 0.0 + +# Norm type, options include `batchnorm`, `sub_batchnorm`, `sync_batchnorm` +_C.BN.NORM_TYPE = "batchnorm" + +# Parameter for SubBatchNorm, where it splits the batch dimension into +# NUM_SPLITS splits, and run BN on each of them separately independently. +_C.BN.NUM_SPLITS = 1 + +# Parameter for NaiveSyncBatchNorm, where the stats across `NUM_SYNC_DEVICES` +# devices will be synchronized. `NUM_SYNC_DEVICES` cannot be larger than number of +# devices per machine; if global sync is desired, set `GLOBAL_SYNC`. 
+# By default ONLY applies to NaiveSyncBatchNorm3d; consider also setting +# CONTRASTIVE.BN_SYNC_MLP if appropriate. +_C.BN.NUM_SYNC_DEVICES = 1 + +# Parameter for NaiveSyncBatchNorm. Setting `GLOBAL_SYNC` to True synchronizes +# stats across all devices, across all machines; in this case, `NUM_SYNC_DEVICES` +# must be set to None. +# By default ONLY applies to NaiveSyncBatchNorm3d; consider also setting +# CONTRASTIVE.BN_SYNC_MLP if appropriate. +_C.BN.GLOBAL_SYNC = False + +# ---------------------------------------------------------------------------- # +# Training options. +# ---------------------------------------------------------------------------- # +_C.TRAIN = CfgNode() + +# If True Train the model, else skip training. +_C.TRAIN.ENABLE = True + +# Kill training if loss explodes over this ratio from the previous 5 measurements. +# Only enforced if > 0.0 +_C.TRAIN.KILL_LOSS_EXPLOSION_FACTOR = 0.0 + +# Dataset. +_C.TRAIN.DATASET = "kinetics" + +# Total mini-batch size. +_C.TRAIN.BATCH_SIZE = 64 + +# Evaluate model on test data every eval period epochs. +_C.TRAIN.EVAL_PERIOD = 10 + +# Save model checkpoint every checkpoint period epochs. +_C.TRAIN.CHECKPOINT_PERIOD = 10 + +# Resume training from the latest checkpoint in the output directory. +_C.TRAIN.AUTO_RESUME = True + +# Path to the checkpoint to load the initial weight. +_C.TRAIN.CHECKPOINT_FILE_PATH = "" + +# Checkpoint types include `caffe2` or `pytorch`. +_C.TRAIN.CHECKPOINT_TYPE = "pytorch" + +# If True, perform inflation when loading checkpoint. +_C.TRAIN.CHECKPOINT_INFLATE = False + +# If True, reset epochs when loading checkpoint. +_C.TRAIN.CHECKPOINT_EPOCH_RESET = False + +# If set, clear all layer names according to the pattern provided. +_C.TRAIN.CHECKPOINT_CLEAR_NAME_PATTERN = () # ("backbone.",) + +# If True, use FP16 for activations +_C.TRAIN.MIXED_PRECISION = False + +# if True, inflate some params from imagenet model. +_C.TRAIN.CHECKPOINT_IN_INIT = False + +# ---------------------------------------------------------------------------- # +# Augmentation options. +# ---------------------------------------------------------------------------- # +_C.AUG = CfgNode() + +# Whether to enable randaug. +_C.AUG.ENABLE = False + +# Number of repeated augmentations to used during training. +# If this is greater than 1, then the actual batch size is +# TRAIN.BATCH_SIZE * AUG.NUM_SAMPLE. +_C.AUG.NUM_SAMPLE = 1 + +# Not used if using randaug. +_C.AUG.COLOR_JITTER = 0.4 + +# RandAug parameters. +_C.AUG.AA_TYPE = "rand-m9-mstd0.5-inc1" + +# Interpolation method. +_C.AUG.INTERPOLATION = "bicubic" + +# Probability of random erasing. +_C.AUG.RE_PROB = 0.25 + +# Random erasing mode. +_C.AUG.RE_MODE = "pixel" + +# Random erase count. +_C.AUG.RE_COUNT = 1 + +# Do not random erase first (clean) augmentation split. +_C.AUG.RE_SPLIT = False + +# Whether to generate input mask during image processing. +_C.AUG.GEN_MASK_LOADER = False + +# If True, masking mode is "tube". Default is "cube". +_C.AUG.MASK_TUBE = False + +# If True, masking mode is "frame". Default is "cube". +_C.AUG.MASK_FRAMES = False + +# The size of generated masks. +_C.AUG.MASK_WINDOW_SIZE = [8, 7, 7] + +# The ratio of masked tokens out of all tokens. Also applies to MViT supervised training +_C.AUG.MASK_RATIO = 0.0 + +# The maximum number of a masked block. None means no maximum limit. (Used only in image MaskFeat.) 
+_C.AUG.MAX_MASK_PATCHES_PER_BLOCK = None + +# ---------------------------------------------------------------------------- # +# Masked pretraining visualization options. +# ---------------------------------------------------------------------------- # +_C.VIS_MASK = CfgNode() + +# Whether to do visualization. +_C.VIS_MASK.ENABLE = False + +# ---------------------------------------------------------------------------- # +# MipUp options. +# ---------------------------------------------------------------------------- # +_C.MIXUP = CfgNode() + +# Whether to use mixup. +_C.MIXUP.ENABLE = False + +# Mixup alpha. +_C.MIXUP.ALPHA = 0.8 + +# Cutmix alpha. +_C.MIXUP.CUTMIX_ALPHA = 1.0 + +# Probability of performing mixup or cutmix when either/both is enabled. +_C.MIXUP.PROB = 1.0 + +# Probability of switching to cutmix when both mixup and cutmix enabled. +_C.MIXUP.SWITCH_PROB = 0.5 + +# Label smoothing. +_C.MIXUP.LABEL_SMOOTH_VALUE = 0.1 + +# ---------------------------------------------------------------------------- # +# Testing options +# ---------------------------------------------------------------------------- # +_C.TEST = CfgNode() + +# If True test the model, else skip the testing. +_C.TEST.ENABLE = True + +# Dataset for testing. +_C.TEST.DATASET = "kinetics" + +# Total mini-batch size +_C.TEST.BATCH_SIZE = 8 + +# Path to the checkpoint to load the initial weight. +_C.TEST.CHECKPOINT_FILE_PATH = "" + +# Number of clips to sample from a video uniformly for aggregating the +# prediction results. +_C.TEST.NUM_ENSEMBLE_VIEWS = 10 + +# Number of crops to sample from a frame spatially for aggregating the +# prediction results. +_C.TEST.NUM_SPATIAL_CROPS = 3 + +# Checkpoint types include `caffe2` or `pytorch`. +_C.TEST.CHECKPOINT_TYPE = "pytorch" +# Path to saving prediction results file. +_C.TEST.SAVE_RESULTS_PATH = "" + +_C.TEST.NUM_TEMPORAL_CLIPS = [] +# ----------------------------------------------------------------------------- +# ResNet options +# ----------------------------------------------------------------------------- +_C.RESNET = CfgNode() + +# Transformation function. +_C.RESNET.TRANS_FUNC = "bottleneck_transform" + +# Number of groups. 1 for ResNet, and larger than 1 for ResNeXt). +_C.RESNET.NUM_GROUPS = 1 + +# Width of each group (64 -> ResNet; 4 -> ResNeXt). +_C.RESNET.WIDTH_PER_GROUP = 64 + +# Apply relu in a inplace manner. +_C.RESNET.INPLACE_RELU = True + +# Apply stride to 1x1 conv. +_C.RESNET.STRIDE_1X1 = False + +# If true, initialize the gamma of the final BN of each block to zero. +_C.RESNET.ZERO_INIT_FINAL_BN = False + +# If true, initialize the final conv layer of each block to zero. +_C.RESNET.ZERO_INIT_FINAL_CONV = False + +# Number of weight layers. +_C.RESNET.DEPTH = 50 + +# If the current block has more than NUM_BLOCK_TEMP_KERNEL blocks, use temporal +# kernel of 1 for the rest of the blocks. +_C.RESNET.NUM_BLOCK_TEMP_KERNEL = [[3], [4], [6], [3]] + +# Size of stride on different res stages. +_C.RESNET.SPATIAL_STRIDES = [[1], [2], [2], [2]] + +# Size of dilation on different res stages. +_C.RESNET.SPATIAL_DILATIONS = [[1], [1], [1], [1]] + +# ---------------------------------------------------------------------------- # +# X3D options +# See https://arxiv.org/abs/2004.04730 for details about X3D Networks. +# ---------------------------------------------------------------------------- # +_C.X3D = CfgNode() + +# Width expansion factor. +_C.X3D.WIDTH_FACTOR = 1.0 + +# Depth expansion factor. 
+_C.X3D.DEPTH_FACTOR = 1.0 + +# Bottleneck expansion factor for the 3x3x3 conv. +_C.X3D.BOTTLENECK_FACTOR = 1.0 # + +# Dimensions of the last linear layer before classificaiton. +_C.X3D.DIM_C5 = 2048 + +# Dimensions of the first 3x3 conv layer. +_C.X3D.DIM_C1 = 12 + +# Whether to scale the width of Res2, default is false. +_C.X3D.SCALE_RES2 = False + +# Whether to use a BatchNorm (BN) layer before the classifier, default is false. +_C.X3D.BN_LIN5 = False + +# Whether to use channelwise (=depthwise) convolution in the center (3x3x3) +# convolution operation of the residual blocks. +_C.X3D.CHANNELWISE_3x3x3 = True + +# ----------------------------------------------------------------------------- +# Nonlocal options +# ----------------------------------------------------------------------------- +_C.NONLOCAL = CfgNode() + +# Index of each stage and block to add nonlocal layers. +_C.NONLOCAL.LOCATION = [[[]], [[]], [[]], [[]]] + +# Number of group for nonlocal for each stage. +_C.NONLOCAL.GROUP = [[1], [1], [1], [1]] + +# Instatiation to use for non-local layer. +_C.NONLOCAL.INSTANTIATION = "dot_product" + + +# Size of pooling layers used in Non-Local. +_C.NONLOCAL.POOL = [ + # Res2 + [[1, 2, 2], [1, 2, 2]], + # Res3 + [[1, 2, 2], [1, 2, 2]], + # Res4 + [[1, 2, 2], [1, 2, 2]], + # Res5 + [[1, 2, 2], [1, 2, 2]], +] + +# ----------------------------------------------------------------------------- +# Model options +# ----------------------------------------------------------------------------- +_C.MODEL = CfgNode() + +# Model architecture. +_C.MODEL.ARCH = "slowfast" + +# Model name +_C.MODEL.MODEL_NAME = "SlowFast" + +# The number of classes to predict for the model. +_C.MODEL.NUM_CLASSES = 400 + +# Loss function. +_C.MODEL.LOSS_FUNC = "cross_entropy" + +# Model architectures that has one single pathway. +_C.MODEL.SINGLE_PATHWAY_ARCH = [ + "2d", + "c2d", + "i3d", + "slow", + "x3d", + "mvit", + "maskmvit", +] + +# Model architectures that has multiple pathways. +_C.MODEL.MULTI_PATHWAY_ARCH = ["slowfast"] + +# Dropout rate before final projection in the backbone. +_C.MODEL.DROPOUT_RATE = 0.5 + +# Randomly drop rate for Res-blocks, linearly increase from res2 to res5 +_C.MODEL.DROPCONNECT_RATE = 0.0 + +# The std to initialize the fc layer(s). +_C.MODEL.FC_INIT_STD = 0.01 + +# Activation layer for the output head. +_C.MODEL.HEAD_ACT = "softmax" + +# Activation checkpointing enabled or not to save GPU memory. +_C.MODEL.ACT_CHECKPOINT = False + +# If True, detach the final fc layer from the network, by doing so, only the +# final fc layer will be trained. +_C.MODEL.DETACH_FINAL_FC = False + +# If True, frozen batch norm stats during training. +_C.MODEL.FROZEN_BN = False + +# If True, AllReduce gradients are compressed to fp16 +_C.MODEL.FP16_ALLREDUCE = False + + +# ----------------------------------------------------------------------------- +# MViT options +# ----------------------------------------------------------------------------- +_C.MVIT = CfgNode() + +# Options include `conv`, `max`. +_C.MVIT.MODE = "conv" + +# If True, perform pool before projection in attention. +_C.MVIT.POOL_FIRST = False + +# If True, use cls embed in the network, otherwise don't use cls_embed in transformer. +_C.MVIT.CLS_EMBED_ON = True + +# Kernel size for patchtification. +_C.MVIT.PATCH_KERNEL = [3, 7, 7] + +# Stride size for patchtification. +_C.MVIT.PATCH_STRIDE = [2, 4, 4] + +# Padding size for patchtification. +_C.MVIT.PATCH_PADDING = [2, 4, 4] + +# If True, use 2d patch, otherwise use 3d patch. 
+_C.MVIT.PATCH_2D = False
+
+# Base embedding dimension for the transformer.
+_C.MVIT.EMBED_DIM = 96
+
+# Base num of heads for the transformer.
+_C.MVIT.NUM_HEADS = 1
+
+# Dimension reduction ratio for the MLP layers.
+_C.MVIT.MLP_RATIO = 4.0
+
+# If True, use bias term in attention fc layers.
+_C.MVIT.QKV_BIAS = True
+
+# Drop path rate for the transformer.
+_C.MVIT.DROPPATH_RATE = 0.1
+
+# The initial value of layer scale gamma. Set 0.0 to disable layer scale.
+_C.MVIT.LAYER_SCALE_INIT_VALUE = 0.0
+
+# Depth of the transformer.
+_C.MVIT.DEPTH = 16
+
+# Normalization layer for the transformer. Only layernorm is supported now.
+_C.MVIT.NORM = "layernorm"
+
+# Dimension multiplication at layer i. If 2.0 is used, then the next block will increase
+# the dimension by 2 times. Format: [depth_i: mul_dim_ratio]
+_C.MVIT.DIM_MUL = []
+
+# Head number multiplication at layer i. If 2.0 is used, then the next block will
+# increase the number of heads by 2 times. Format: [depth_i: head_mul_ratio]
+_C.MVIT.HEAD_MUL = []
+
+# Stride size for the Pool KV at layer i.
+# Format: [[i, stride_t_i, stride_h_i, stride_w_i], ...,]
+_C.MVIT.POOL_KV_STRIDE = []
+
+# Initial stride size for KV at layer 1. The stride size will be further reduced with
+# the ratio of MVIT.DIM_MUL. It will overwrite MVIT.POOL_KV_STRIDE if not None.
+_C.MVIT.POOL_KV_STRIDE_ADAPTIVE = None
+
+# Stride size for the Pool Q at layer i.
+# Format: [[i, stride_t_i, stride_h_i, stride_w_i], ...,]
+_C.MVIT.POOL_Q_STRIDE = []
+
+# If not None, overwrite the KV_KERNEL and Q_KERNEL size with POOL_KVQ_KERNEL.
+# Otherwise the kernel_size is [s + 1 if s > 1 else s for s in stride_size].
+_C.MVIT.POOL_KVQ_KERNEL = None
+
+# If True, perform no decay on positional embedding and cls embedding.
+_C.MVIT.ZERO_DECAY_POS_CLS = True
+
+# If True, use norm after stem.
+_C.MVIT.NORM_STEM = False
+
+# If True, perform separate positional embedding.
+_C.MVIT.SEP_POS_EMBED = False
+
+# Dropout rate for the MViT backbone.
+_C.MVIT.DROPOUT_RATE = 0.0
+
+# If True, use absolute positional embedding.
+_C.MVIT.USE_ABS_POS = True
+
+# If True, use relative positional embedding for spatial dimensions.
+_C.MVIT.REL_POS_SPATIAL = False
+
+# If True, use relative positional embedding for temporal dimensions.
+_C.MVIT.REL_POS_TEMPORAL = False
+
+# If True, initialize relative positional embeddings with zero.
+_C.MVIT.REL_POS_ZERO_INIT = False
+
+# If True, use the residual pooling connection.
+_C.MVIT.RESIDUAL_POOLING = False
+
+# Apply the dimension multiplication in the qkv linear layers of the attention
+# block instead of the MLP.
+_C.MVIT.DIM_MUL_IN_ATT = False
+
+# If True, use separate linear layers for Q, K, V in attention blocks.
+_C.MVIT.SEPARATE_QKV = False
+
+# The initialization scale factor for the head parameters.
+_C.MVIT.HEAD_INIT_SCALE = 1.0
+
+# Whether to use the mean pooling of all patch tokens as the output.
+_C.MVIT.USE_MEAN_POOLING = False
+
+# If True, use frozen sin cos positional embedding.
+_C.MVIT.USE_FIXED_SINCOS_POS = False
+
+# -----------------------------------------------------------------------------
+# Masked pretraining options
+# -----------------------------------------------------------------------------
+_C.MASK = CfgNode()
+
+# Whether to enable Masked style pretraining.
+_C.MASK.ENABLE = False
+
+# Whether to enable MAE (discard encoder tokens).
+_C.MASK.MAE_ON = False + +# Whether to enable random masking in mae +_C.MASK.MAE_RND_MASK = False + +# Whether to do random masking per-frame in mae +_C.MASK.PER_FRAME_MASKING = False + +# only predict loss on temporal strided patches, or predict full time extent +_C.MASK.TIME_STRIDE_LOSS = True + +# Whether to normalize the pred pixel loss +_C.MASK.NORM_PRED_PIXEL = True + +# Whether to fix initialization with inverse depth of layer for pretraining. +_C.MASK.SCALE_INIT_BY_DEPTH = False + +# Base embedding dimension for the decoder transformer. +_C.MASK.DECODER_EMBED_DIM = 512 + +# Base embedding dimension for the decoder transformer. +_C.MASK.DECODER_SEP_POS_EMBED = False + +# Use a KV kernel in decoder? +_C.MASK.DEC_KV_KERNEL = [] + +# Use a KV stride in decoder? +_C.MASK.DEC_KV_STRIDE = [] + +# The depths of features which are inputs of the prediction head. +_C.MASK.PRETRAIN_DEPTH = [15] + +# The type of Masked pretraining prediction head. +# Can be "separate", "separate_xformer". +_C.MASK.HEAD_TYPE = "separate" + +# The depth of MAE's decoder +_C.MASK.DECODER_DEPTH = 0 + +# The weight of HOG target loss. +_C.MASK.PRED_HOG = False +# Reversible Configs +_C.MVIT.REV = CfgNode() + +# Enable Reversible Model +_C.MVIT.REV.ENABLE = False + +# Method to fuse the reversible paths +# see :class: `TwoStreamFusion` for all the options +_C.MVIT.REV.RESPATH_FUSE = "concat" + +# Layers to buffer activations at +# (at least Q-pooling layers needed) +_C.MVIT.REV.BUFFER_LAYERS = [] + +# 'conv' or 'max' operator for the respath in Qpooling +_C.MVIT.REV.RES_PATH = "conv" + +# Method to merge hidden states before Qpoolinglayers +_C.MVIT.REV.PRE_Q_FUSION = "avg" + +# ----------------------------------------------------------------------------- +# SlowFast options +# ----------------------------------------------------------------------------- +_C.SLOWFAST = CfgNode() + +# Corresponds to the inverse of the channel reduction ratio, $\beta$ between +# the Slow and Fast pathways. +_C.SLOWFAST.BETA_INV = 8 + +# Corresponds to the frame rate reduction ratio, $\alpha$ between the Slow and +# Fast pathways. +_C.SLOWFAST.ALPHA = 8 + +# Ratio of channel dimensions between the Slow and Fast pathways. +_C.SLOWFAST.FUSION_CONV_CHANNEL_RATIO = 2 + +# Kernel dimension used for fusing information from Fast pathway to Slow +# pathway. +_C.SLOWFAST.FUSION_KERNEL_SZ = 5 + + +# ----------------------------------------------------------------------------- +# Data options +# ----------------------------------------------------------------------------- +_C.DATA = CfgNode() + +# The path to the data directory. +_C.DATA.PATH_TO_DATA_DIR = "" + +# The separator used between path and label. +_C.DATA.PATH_LABEL_SEPARATOR = " " + +# Video path prefix if any. +_C.DATA.PATH_PREFIX = "" + +# The number of frames of the input clip. +_C.DATA.NUM_FRAMES = 8 + +# The video sampling rate of the input clip. +_C.DATA.SAMPLING_RATE = 8 + +# Eigenvalues for PCA jittering. Note PCA is RGB based. +_C.DATA.TRAIN_PCA_EIGVAL = [0.225, 0.224, 0.229] + +# Eigenvectors for PCA jittering. +_C.DATA.TRAIN_PCA_EIGVEC = [ + [-0.5675, 0.7192, 0.4009], + [-0.5808, -0.0045, -0.8140], + [-0.5836, -0.6948, 0.4203], +] + +# If a imdb have been dumpped to a local file with the following format: +# `{"im_path": im_path, "class": cont_id}` +# then we can skip the construction of imdb and load it from the local file. +_C.DATA.PATH_TO_PRELOAD_IMDB = "" + +# The mean value of the video raw pixels across the R G B channels. 
+_C.DATA.MEAN = [0.45, 0.45, 0.45] +# List of input frame channel dimensions. + +_C.DATA.INPUT_CHANNEL_NUM = [3, 3] + +# The std value of the video raw pixels across the R G B channels. +_C.DATA.STD = [0.225, 0.225, 0.225] + +# The spatial augmentation jitter scales for training. +_C.DATA.TRAIN_JITTER_SCALES = [256, 320] + +# The relative scale range of Inception-style area based random resizing augmentation. +# If this is provided, DATA.TRAIN_JITTER_SCALES above is ignored. +_C.DATA.TRAIN_JITTER_SCALES_RELATIVE = [] + +# The relative aspect ratio range of Inception-style area based random resizing +# augmentation. +_C.DATA.TRAIN_JITTER_ASPECT_RELATIVE = [] + +# If True, perform stride length uniform temporal sampling. +_C.DATA.USE_OFFSET_SAMPLING = False + +# Whether to apply motion shift for augmentation. +_C.DATA.TRAIN_JITTER_MOTION_SHIFT = False + +# The spatial crop size for training. +_C.DATA.TRAIN_CROP_SIZE = 224 + +# The spatial crop size for testing. +_C.DATA.TEST_CROP_SIZE = 256 + +# Input videos may has different fps, convert it to the target video fps before +# frame sampling. +_C.DATA.TARGET_FPS = 30 + +# JITTER TARGET_FPS by +- this number randomly +_C.DATA.TRAIN_JITTER_FPS = 0.0 + +# Decoding backend, options include `pyav` or `torchvision` +_C.DATA.DECODING_BACKEND = "torchvision" + +# Decoding resize to short size (set to native size for best speed) +_C.DATA.DECODING_SHORT_SIZE = 256 + +# if True, sample uniformly in [1 / max_scale, 1 / min_scale] and take a +# reciprocal to get the scale. If False, take a uniform sample from +# [min_scale, max_scale]. +_C.DATA.INV_UNIFORM_SAMPLE = False + +# If True, perform random horizontal flip on the video frames during training. +_C.DATA.RANDOM_FLIP = True + +# If True, calculdate the map as metric. +_C.DATA.MULTI_LABEL = False + +# Method to perform the ensemble, options include "sum" and "max". +_C.DATA.ENSEMBLE_METHOD = "sum" + +# If True, revert the default input channel (RBG <-> BGR). +_C.DATA.REVERSE_INPUT_CHANNEL = False + +# how many samples (=clips) to decode from a single video +_C.DATA.TRAIN_CROP_NUM_TEMPORAL = 1 + +# how many spatial samples to crop from a single clip +_C.DATA.TRAIN_CROP_NUM_SPATIAL = 1 + +# color random percentage for grayscale conversion +_C.DATA.COLOR_RND_GRAYSCALE = 0.0 + +# loader can read .csv file in chunks of this chunk size +_C.DATA.LOADER_CHUNK_SIZE = 0 + +# if LOADER_CHUNK_SIZE > 0, define overall length of .csv file +_C.DATA.LOADER_CHUNK_OVERALL_SIZE = 0 + +# for chunked reading, dataloader can skip rows in (large) +# training csv file +_C.DATA.SKIP_ROWS = 0 + +# The separator used between path and label. 
+_C.DATA.PATH_LABEL_SEPARATOR = " " + +# augmentation probability to convert raw decoded video to +# grayscale temporal difference +_C.DATA.TIME_DIFF_PROB = 0.0 + +# Apply SSL-based SimCLR / MoCo v1/v2 color augmentations, +# with params below +_C.DATA.SSL_COLOR_JITTER = False + +# color jitter percentage for brightness, contrast, saturation +_C.DATA.SSL_COLOR_BRI_CON_SAT = [0.4, 0.4, 0.4] + +# color jitter percentage for hue +_C.DATA.SSL_COLOR_HUE = 0.1 + +# SimCLR / MoCo v2 augmentations on/off +_C.DATA.SSL_MOCOV2_AUG = False + +# SimCLR / MoCo v2 blur augmentation minimum gaussian sigma +_C.DATA.SSL_BLUR_SIGMA_MIN = [0.0, 0.1] + +# SimCLR / MoCo v2 blur augmentation maximum gaussian sigma +_C.DATA.SSL_BLUR_SIGMA_MAX = [0.0, 2.0] + + +# If combine train/val split as training for in21k +_C.DATA.IN22K_TRAINVAL = False + +# If not None, use IN1k as val split when training in21k +_C.DATA.IN22k_VAL_IN1K = "" + +# Large resolution models may use different crop ratios +_C.DATA.IN_VAL_CROP_RATIO = 0.875 # 224/256 = 0.875 + +# don't use real video for kinetics.py +_C.DATA.DUMMY_LOAD = False + +# ---------------------------------------------------------------------------- # +# Optimizer options +# ---------------------------------------------------------------------------- # +_C.SOLVER = CfgNode() + +# Base learning rate. +_C.SOLVER.BASE_LR = 0.1 + +# Learning rate policy (see utils/lr_policy.py for options and examples). +_C.SOLVER.LR_POLICY = "cosine" + +# Final learning rates for 'cosine' policy. +_C.SOLVER.COSINE_END_LR = 0.0 + +# Exponential decay factor. +_C.SOLVER.GAMMA = 0.1 + +# Step size for 'exp' and 'cos' policies (in epochs). +_C.SOLVER.STEP_SIZE = 1 + +# Steps for 'steps_' policies (in epochs). +_C.SOLVER.STEPS = [] + +# Learning rates for 'steps_' policies. +_C.SOLVER.LRS = [] + +# Maximal number of epochs. +_C.SOLVER.MAX_EPOCH = 300 + +# Momentum. +_C.SOLVER.MOMENTUM = 0.9 + +# Momentum dampening. +_C.SOLVER.DAMPENING = 0.0 + +# Nesterov momentum. +_C.SOLVER.NESTEROV = True + +# L2 regularization. +_C.SOLVER.WEIGHT_DECAY = 1e-4 + +# Start the warm up from SOLVER.BASE_LR * SOLVER.WARMUP_FACTOR. +_C.SOLVER.WARMUP_FACTOR = 0.1 + +# Gradually warm up the SOLVER.BASE_LR over this number of epochs. +_C.SOLVER.WARMUP_EPOCHS = 0.0 + +# The start learning rate of the warm up. +_C.SOLVER.WARMUP_START_LR = 0.01 + +# Optimization method. +_C.SOLVER.OPTIMIZING_METHOD = "sgd" + +# Base learning rate is linearly scaled with NUM_SHARDS. +_C.SOLVER.BASE_LR_SCALE_NUM_SHARDS = False + +# If True, start from the peak cosine learning rate after warm up. +_C.SOLVER.COSINE_AFTER_WARMUP = False + +# If True, perform no weight decay on parameter with one dimension (bias term, etc). +_C.SOLVER.ZERO_WD_1D_PARAM = False + +# Clip gradient at this value before optimizer update +_C.SOLVER.CLIP_GRAD_VAL = None + +# Clip gradient at this norm before optimizer update +_C.SOLVER.CLIP_GRAD_L2NORM = None + +# LARS optimizer +_C.SOLVER.LARS_ON = False + +# The layer-wise decay of learning rate. Set to 1. to disable. +_C.SOLVER.LAYER_DECAY = 1.0 + +# Adam's beta +_C.SOLVER.BETAS = (0.9, 0.999) +# ---------------------------------------------------------------------------- # +# Misc options +# ---------------------------------------------------------------------------- # + +# The name of the current task; e.g. "ssl"/"sl" for (self)supervised learning +_C.TASK = "" + +# Number of GPUs to use (applies to both training and testing). +_C.NUM_GPUS = 1 + +# Number of machine to use for the job. 
+_C.NUM_SHARDS = 1 + +# The index of the current machine. +_C.SHARD_ID = 0 + +# Output basedir. +_C.OUTPUT_DIR = "." + +# Note that non-determinism may still be present due to non-deterministic +# operator implementations in GPU operator libraries. +_C.RNG_SEED = 1 + +# Log period in iters. +_C.LOG_PERIOD = 10 + +# If True, log the model info. +_C.LOG_MODEL_INFO = True + +# Distributed backend. +_C.DIST_BACKEND = "nccl" + +# ---------------------------------------------------------------------------- # +# Benchmark options +# ---------------------------------------------------------------------------- # +_C.BENCHMARK = CfgNode() + +# Number of epochs for data loading benchmark. +_C.BENCHMARK.NUM_EPOCHS = 5 + +# Log period in iters for data loading benchmark. +_C.BENCHMARK.LOG_PERIOD = 100 + +# If True, shuffle dataloader for epoch during benchmark. +_C.BENCHMARK.SHUFFLE = True + + +# ---------------------------------------------------------------------------- # +# Common train/test data loader options +# ---------------------------------------------------------------------------- # +_C.DATA_LOADER = CfgNode() + +# Number of data loader workers per training process. +_C.DATA_LOADER.NUM_WORKERS = 8 + +# Load data to pinned host memory. +_C.DATA_LOADER.PIN_MEMORY = True + +# Enable multi thread decoding. +_C.DATA_LOADER.ENABLE_MULTI_THREAD_DECODE = False + + +# ---------------------------------------------------------------------------- # +# Detection options. +# ---------------------------------------------------------------------------- # +_C.DETECTION = CfgNode() + +# Whether enable video detection. +_C.DETECTION.ENABLE = False + +# Aligned version of RoI. More details can be found at slowfast/models/head_helper.py +_C.DETECTION.ALIGNED = True + +# Spatial scale factor. +_C.DETECTION.SPATIAL_SCALE_FACTOR = 16 + +# RoI tranformation resolution. +_C.DETECTION.ROI_XFORM_RESOLUTION = 7 + + +# ----------------------------------------------------------------------------- +# AVA Dataset options +# ----------------------------------------------------------------------------- +_C.AVA = CfgNode() + +# Directory path of frames. +_C.AVA.FRAME_DIR = "/mnt/fair-flash3-east/ava_trainval_frames.img/" + +# Directory path for files of frame lists. +_C.AVA.FRAME_LIST_DIR = ( + "/mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/" +) + +# Directory path for annotation files. +_C.AVA.ANNOTATION_DIR = ( + "/mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/" +) + +# Filenames of training samples list files. +_C.AVA.TRAIN_LISTS = ["train.csv"] + +# Filenames of test samples list files. +_C.AVA.TEST_LISTS = ["val.csv"] + +# Filenames of box list files for training. Note that we assume files which +# contains predicted boxes will have a suffix "predicted_boxes" in the +# filename. +_C.AVA.TRAIN_GT_BOX_LISTS = ["ava_train_v2.2.csv"] +_C.AVA.TRAIN_PREDICT_BOX_LISTS = [] + +# Filenames of box list files for test. +_C.AVA.TEST_PREDICT_BOX_LISTS = ["ava_val_predicted_boxes.csv"] + +# This option controls the score threshold for the predicted boxes to use. +_C.AVA.DETECTION_SCORE_THRESH = 0.9 + +# If use BGR as the format of input frames. +_C.AVA.BGR = False + +# Training augmentation parameters +# Whether to use color augmentation method. +_C.AVA.TRAIN_USE_COLOR_AUGMENTATION = False + +# Whether to only use PCA jitter augmentation when using color augmentation +# method (otherwise combine with color jitter method). 
+_C.AVA.TRAIN_PCA_JITTER_ONLY = True + +# Whether to do horizontal flipping during test. +_C.AVA.TEST_FORCE_FLIP = False + +# Whether to use full test set for validation split. +_C.AVA.FULL_TEST_ON_VAL = False + +# The name of the file to the ava label map. +_C.AVA.LABEL_MAP_FILE = "ava_action_list_v2.2_for_activitynet_2019.pbtxt" + +# The name of the file to the ava exclusion. +_C.AVA.EXCLUSION_FILE = "ava_val_excluded_timestamps_v2.2.csv" + +# The name of the file to the ava groundtruth. +_C.AVA.GROUNDTRUTH_FILE = "ava_val_v2.2.csv" + +# Backend to process image, includes `pytorch` and `cv2`. +_C.AVA.IMG_PROC_BACKEND = "cv2" + +# ---------------------------------------------------------------------------- # +# Multigrid training options +# See https://arxiv.org/abs/1912.00998 for details about multigrid training. +# ---------------------------------------------------------------------------- # +_C.MULTIGRID = CfgNode() + +# Multigrid training allows us to train for more epochs with fewer iterations. +# This hyperparameter specifies how many times more epochs to train. +# The default setting in paper trains for 1.5x more epochs than baseline. +_C.MULTIGRID.EPOCH_FACTOR = 1.5 + +# Enable short cycles. +_C.MULTIGRID.SHORT_CYCLE = False +# Short cycle additional spatial dimensions relative to the default crop size. +_C.MULTIGRID.SHORT_CYCLE_FACTORS = [0.5, 0.5**0.5] + +_C.MULTIGRID.LONG_CYCLE = False +# (Temporal, Spatial) dimensions relative to the default shape. +_C.MULTIGRID.LONG_CYCLE_FACTORS = [ + (0.25, 0.5**0.5), + (0.5, 0.5**0.5), + (0.5, 1), + (1, 1), +] + +# While a standard BN computes stats across all examples in a GPU, +# for multigrid training we fix the number of clips to compute BN stats on. +# See https://arxiv.org/abs/1912.00998 for details. +_C.MULTIGRID.BN_BASE_SIZE = 8 + +# Multigrid training epochs are not proportional to actual training time or +# computations, so _C.TRAIN.EVAL_PERIOD leads to too frequent or rare +# evaluation. We use a multigrid-specific rule to determine when to evaluate: +# This hyperparameter defines how many times to evaluate a model per long +# cycle shape. +_C.MULTIGRID.EVAL_FREQ = 3 + +# No need to specify; Set automatically and used as global variables. +_C.MULTIGRID.LONG_CYCLE_SAMPLING_RATE = 0 +_C.MULTIGRID.DEFAULT_B = 0 +_C.MULTIGRID.DEFAULT_T = 0 +_C.MULTIGRID.DEFAULT_S = 0 + +# ----------------------------------------------------------------------------- +# Tensorboard Visualization Options +# ----------------------------------------------------------------------------- +_C.TENSORBOARD = CfgNode() + +# Log to summary writer, this will automatically. +# log loss, lr and metrics during train/eval. +_C.TENSORBOARD.ENABLE = False +# Provide path to prediction results for visualization. +# This is a pickle file of [prediction_tensor, label_tensor] +_C.TENSORBOARD.PREDICTIONS_PATH = "" +# Path to directory for tensorboard logs. +# Default to to cfg.OUTPUT_DIR/runs-{cfg.TRAIN.DATASET}. +_C.TENSORBOARD.LOG_DIR = "" +# Path to a json file providing class_name - id mapping +# in the format {"class_name1": id1, "class_name2": id2, ...}. +# This file must be provided to enable plotting confusion matrix +# by a subset or parent categories. +_C.TENSORBOARD.CLASS_NAMES_PATH = "" + +# Path to a json file for categories -> classes mapping +# in the format {"parent_class": ["child_class1", "child_class2",...], ...}. +_C.TENSORBOARD.CATEGORIES_PATH = "" + +# Config for confusion matrices visualization. 
+_C.TENSORBOARD.CONFUSION_MATRIX = CfgNode() +# Visualize confusion matrix. +_C.TENSORBOARD.CONFUSION_MATRIX.ENABLE = False +# Figure size of the confusion matrices plotted. +_C.TENSORBOARD.CONFUSION_MATRIX.FIGSIZE = [8, 8] +# Path to a subset of categories to visualize. +# File contains class names separated by newline characters. +_C.TENSORBOARD.CONFUSION_MATRIX.SUBSET_PATH = "" + +# Config for histogram visualization. +_C.TENSORBOARD.HISTOGRAM = CfgNode() +# Visualize histograms. +_C.TENSORBOARD.HISTOGRAM.ENABLE = False +# Path to a subset of classes to plot histograms. +# Class names must be separated by newline characters. +_C.TENSORBOARD.HISTOGRAM.SUBSET_PATH = "" +# Visualize top-k most predicted classes on histograms for each +# chosen true label. +_C.TENSORBOARD.HISTOGRAM.TOPK = 10 +# Figure size of the histograms plotted. +_C.TENSORBOARD.HISTOGRAM.FIGSIZE = [8, 8] + +# Config for layers' weights and activations visualization. +# _C.TENSORBOARD.ENABLE must be True. +_C.TENSORBOARD.MODEL_VIS = CfgNode() + +# If False, skip model visualization. +_C.TENSORBOARD.MODEL_VIS.ENABLE = False + +# If False, skip visualizing model weights. +_C.TENSORBOARD.MODEL_VIS.MODEL_WEIGHTS = False + +# If False, skip visualizing model activations. +_C.TENSORBOARD.MODEL_VIS.ACTIVATIONS = False + +# If False, skip visualizing input videos. +_C.TENSORBOARD.MODEL_VIS.INPUT_VIDEO = False + + +# List of strings containing data about layer names and their indexing to +# visualize weights and activations for. The indexing is meant for +# choosing a subset of activations outputed by a layer for visualization. +# If indexing is not specified, visualize all activations outputed by the layer. +# For each string, layer name and indexing is separated by whitespaces. +# e.g.: [layer1 1,2;1,2, layer2, layer3 150,151;3,4]; this means for each array `arr` +# along the batch dimension in `layer1`, we take arr[[1, 2], [1, 2]] +_C.TENSORBOARD.MODEL_VIS.LAYER_LIST = [] +# Top-k predictions to plot on videos +_C.TENSORBOARD.MODEL_VIS.TOPK_PREDS = 1 +# Colormap to for text boxes and bounding boxes colors +_C.TENSORBOARD.MODEL_VIS.COLORMAP = "Pastel2" +# Config for visualization video inputs with Grad-CAM. +# _C.TENSORBOARD.ENABLE must be True. +_C.TENSORBOARD.MODEL_VIS.GRAD_CAM = CfgNode() +# Whether to run visualization using Grad-CAM technique. +_C.TENSORBOARD.MODEL_VIS.GRAD_CAM.ENABLE = True +# CNN layers to use for Grad-CAM. The number of layers must be equal to +# number of pathway(s). +_C.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST = [] +# If True, visualize Grad-CAM using true labels for each instances. +# If False, use the highest predicted class. +_C.TENSORBOARD.MODEL_VIS.GRAD_CAM.USE_TRUE_LABEL = False +# Colormap to for text boxes and bounding boxes colors +_C.TENSORBOARD.MODEL_VIS.GRAD_CAM.COLORMAP = "viridis" + +# Config for visualization for wrong prediction visualization. +# _C.TENSORBOARD.ENABLE must be True. +_C.TENSORBOARD.WRONG_PRED_VIS = CfgNode() +_C.TENSORBOARD.WRONG_PRED_VIS.ENABLE = False +# Folder tag to origanize model eval videos under. +_C.TENSORBOARD.WRONG_PRED_VIS.TAG = "Incorrectly classified videos." +# Subset of labels to visualize. Only wrong predictions with true labels +# within this subset is visualized. +_C.TENSORBOARD.WRONG_PRED_VIS.SUBSET_PATH = "" + + +# ---------------------------------------------------------------------------- # +# Demo options +# ---------------------------------------------------------------------------- # +_C.DEMO = CfgNode() + +# Run model in DEMO mode. 
+_C.DEMO.ENABLE = False + +# Path to a json file providing class_name - id mapping +# in the format {"class_name1": id1, "class_name2": id2, ...}. +_C.DEMO.LABEL_FILE_PATH = "" + +# Specify a camera device as input. This will be prioritized +# over input video if set. +# If -1, use input video instead. +_C.DEMO.WEBCAM = -1 + +# Path to input video for demo. +_C.DEMO.INPUT_VIDEO = "" +# Custom width for reading input video data. +_C.DEMO.DISPLAY_WIDTH = 0 +# Custom height for reading input video data. +_C.DEMO.DISPLAY_HEIGHT = 0 +# Path to Detectron2 object detection model configuration, +# only used for detection tasks. +_C.DEMO.DETECTRON2_CFG = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml" +# Path to Detectron2 object detection model pre-trained weights. +_C.DEMO.DETECTRON2_WEIGHTS = "detectron2://COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl" +# Threshold for choosing predicted bounding boxes by Detectron2. +_C.DEMO.DETECTRON2_THRESH = 0.9 +# Number of overlapping frames between 2 consecutive clips. +# Increase this number for more frequent action predictions. +# The number of overlapping frames cannot be larger than +# half of the sequence length `cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE` +_C.DEMO.BUFFER_SIZE = 0 +# If specified, the visualized outputs will be written this a video file of +# this path. Otherwise, the visualized outputs will be displayed in a window. +_C.DEMO.OUTPUT_FILE = "" +# Frames per second rate for writing to output video file. +# If not set (-1), use fps rate from input file. +_C.DEMO.OUTPUT_FPS = -1 +# Input format from demo video reader ("RGB" or "BGR"). +_C.DEMO.INPUT_FORMAT = "BGR" +# Draw visualization frames in [keyframe_idx - CLIP_VIS_SIZE, keyframe_idx + CLIP_VIS_SIZE] inclusively. +_C.DEMO.CLIP_VIS_SIZE = 10 +# Number of processes to run video visualizer. +_C.DEMO.NUM_VIS_INSTANCES = 2 + +# Path to pre-computed predicted boxes +_C.DEMO.PREDS_BOXES = "" +# Whether to run in with multi-threaded video reader. +_C.DEMO.THREAD_ENABLE = False +# Take one clip for every `DEMO.NUM_CLIPS_SKIP` + 1 for prediction and visualization. +# This is used for fast demo speed by reducing the prediction/visualiztion frequency. +# If -1, take the most recent read clip for visualization. This mode is only supported +# if `DEMO.THREAD_ENABLE` is set to True. +_C.DEMO.NUM_CLIPS_SKIP = 0 +# Path to ground-truth boxes and labels (optional) +_C.DEMO.GT_BOXES = "" +# The starting second of the video w.r.t bounding boxes file. +_C.DEMO.STARTING_SECOND = 900 +# Frames per second of the input video/folder of images. +_C.DEMO.FPS = 30 +# Visualize with top-k predictions or predictions above certain threshold(s). +# Option: {"thres", "top-k"} +_C.DEMO.VIS_MODE = "thres" +# Threshold for common class names. +_C.DEMO.COMMON_CLASS_THRES = 0.7 +# Theshold for uncommon class names. This will not be +# used if `_C.DEMO.COMMON_CLASS_NAMES` is empty. +_C.DEMO.UNCOMMON_CLASS_THRES = 0.3 +# This is chosen based on distribution of examples in +# each classes in AVA dataset. +_C.DEMO.COMMON_CLASS_NAMES = [ + "watch (a person)", + "talk to (e.g., self, a person, a group)", + "listen to (a person)", + "touch (an object)", + "carry/hold (an object)", + "walk", + "sit", + "lie/sleep", + "bend/bow (at the waist)", +] +# Slow-motion rate for the visualization. The visualized portions of the +# video will be played `_C.DEMO.SLOWMO` times slower than usual speed. +_C.DEMO.SLOWMO = 1 + + +def assert_and_infer_cfg(cfg): + # BN assertions. 
+ if cfg.BN.USE_PRECISE_STATS: + assert cfg.BN.NUM_BATCHES_PRECISE >= 0 + # TRAIN assertions. + assert cfg.TRAIN.CHECKPOINT_TYPE in ["pytorch", "caffe2"] + assert cfg.NUM_GPUS == 0 or cfg.TRAIN.BATCH_SIZE % cfg.NUM_GPUS == 0 + + # TEST assertions. + assert cfg.TEST.CHECKPOINT_TYPE in ["pytorch", "caffe2"] + assert cfg.NUM_GPUS == 0 or cfg.TEST.BATCH_SIZE % cfg.NUM_GPUS == 0 + + # RESNET assertions. + assert cfg.RESNET.NUM_GROUPS > 0 + assert cfg.RESNET.WIDTH_PER_GROUP > 0 + assert cfg.RESNET.WIDTH_PER_GROUP % cfg.RESNET.NUM_GROUPS == 0 + + # Execute LR scaling by num_shards. + if cfg.SOLVER.BASE_LR_SCALE_NUM_SHARDS: + cfg.SOLVER.BASE_LR *= cfg.NUM_SHARDS + cfg.SOLVER.WARMUP_START_LR *= cfg.NUM_SHARDS + cfg.SOLVER.COSINE_END_LR *= cfg.NUM_SHARDS + + # General assertions. + assert cfg.SHARD_ID < cfg.NUM_SHARDS + return cfg + + +def get_cfg(): + return _C.clone() + + +def load_config(path_to_config=None): + # Setup cfg. + cfg = get_cfg() + + # Load config from cfg. + if path_to_config is not None: + cfg.merge_from_file(path_to_config) + return cfg diff --git a/src/kabr_tools/utils/slowfast/head.py b/src/kabr_tools/utils/slowfast/head.py new file mode 100644 index 0000000..c16906c --- /dev/null +++ b/src/kabr_tools/utils/slowfast/head.py @@ -0,0 +1,145 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +"""ResNe(X)t Head helper.""" + +import torch.nn as nn + + +class X3DHead(nn.Module): + """ + X3D head. + This layer performs a fully-connected projection during training, when the + input size is 1x1x1. It performs a convolutional projection during testing + when the input size is larger than 1x1x1. If the inputs are from multiple + different pathways, the inputs will be concatenated after pooling. + """ + + def __init__( + self, + dim_in, + dim_inner, + dim_out, + num_classes, + pool_size, + dropout_rate=0.0, + act_func="softmax", + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + norm_module=nn.BatchNorm3d, + bn_lin5_on=False, + ): + """ + The `__init__` method of any subclass should also contain these + arguments. + X3DHead takes a 5-dim feature tensor (BxCxTxHxW) as input. + + Args: + dim_in (float): the channel dimension C of the input. + num_classes (int): the channel dimensions of the output. + pool_size (float): a single entry list of kernel size for + spatiotemporal pooling for the TxHxW dimensions. + dropout_rate (float): dropout rate. If equal to 0.0, perform no + dropout. + act_func (string): activation function to use. 'softmax': applies + softmax on the output. 'sigmoid': applies sigmoid on the output. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + bn_lin5_on (bool): if True, perform normalization on the features + before the classifier. 
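+
+        Example (illustrative sketch; the dimensions below are assumptions for
+        the sake of the example, not values from a KABR config):
+
+            import torch
+
+            head = X3DHead(
+                dim_in=192,
+                dim_inner=432,
+                dim_out=2048,
+                num_classes=400,
+                pool_size=None,  # None -> adaptive average pooling to 1x1x1
+                dropout_rate=0.5,
+            )
+            feats = [torch.randn(2, 192, 16, 7, 7)]  # one pathway: (B, C, T, H, W)
+            logits = head(feats)  # -> torch.Size([2, 400])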
+ """ + super(X3DHead, self).__init__() + self.pool_size = pool_size + self.dropout_rate = dropout_rate + self.num_classes = num_classes + self.act_func = act_func + self.eps = eps + self.bn_mmt = bn_mmt + self.inplace_relu = inplace_relu + self.bn_lin5_on = bn_lin5_on + self._construct_head(dim_in, dim_inner, dim_out, norm_module) + + def _construct_head(self, dim_in, dim_inner, dim_out, norm_module): + + self.conv_5 = nn.Conv3d( + dim_in, + dim_inner, + kernel_size=(1, 1, 1), + stride=(1, 1, 1), + padding=(0, 0, 0), + bias=False, + ) + self.conv_5_bn = norm_module( + num_features=dim_inner, eps=self.eps, momentum=self.bn_mmt + ) + self.conv_5_relu = nn.ReLU(self.inplace_relu) + + if self.pool_size is None: + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + else: + self.avg_pool = nn.AvgPool3d(self.pool_size, stride=1) + + self.lin_5 = nn.Conv3d( + dim_inner, + dim_out, + kernel_size=(1, 1, 1), + stride=(1, 1, 1), + padding=(0, 0, 0), + bias=False, + ) + if self.bn_lin5_on: + self.lin_5_bn = norm_module( + num_features=dim_out, eps=self.eps, momentum=self.bn_mmt + ) + self.lin_5_relu = nn.ReLU(self.inplace_relu) + + if self.dropout_rate > 0.0: + self.dropout = nn.Dropout(self.dropout_rate) + # Perform FC in a fully convolutional manner. The FC layer will be + # initialized with a different std comparing to convolutional layers. + self.projection = nn.Linear(dim_out, self.num_classes, bias=True) + + # Softmax for evaluation and testing. + if self.act_func == "softmax": + self.act = nn.Softmax(dim=4) + elif self.act_func == "sigmoid": + self.act = nn.Sigmoid() + else: + raise NotImplementedError( + "{} is not supported as an activation" "function.".format( + self.act_func) + ) + + def forward(self, inputs): + # In its current design the X3D head is only useable for a single + # pathway input. + assert len(inputs) == 1, "Input tensor does not contain 1 pathway" + x = self.conv_5(inputs[0]) + x = self.conv_5_bn(x) + x = self.conv_5_relu(x) + x = self.avg_pool(x) + + x = self.lin_5(x) + if self.bn_lin5_on: + x = self.lin_5_bn(x) + x = self.lin_5_relu(x) + + # (N, C, T, H, W) -> (N, T, H, W, C). + x = x.permute((0, 2, 3, 4, 1)) + # Perform dropout. + if hasattr(self, "dropout"): + x = self.dropout(x) + x = self.projection(x) + + # Performs fully convlutional inference. + if not self.training: + x = self.act(x) + x = x.mean([1, 2, 3]) + + x = x.view(x.shape[0], -1) + return x diff --git a/src/kabr_tools/utils/slowfast/norm.py b/src/kabr_tools/utils/slowfast/norm.py new file mode 100644 index 0000000..34fd479 --- /dev/null +++ b/src/kabr_tools/utils/slowfast/norm.py @@ -0,0 +1,109 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +"""BatchNorm (BN) utility functions and custom batch-size BN implementations""" + +from functools import partial + +import torch +import torch.nn as nn + +from pytorchvideo.layers.batch_norm import NaiveSyncBatchNorm3d + + +def get_norm(cfg): + """ + Args: + cfg (CfgNode): model building configs, details are in the comments of + the config file. + Returns: + nn.Module: the normalization layer. 
+ """ + if cfg.BN.NORM_TYPE in {"batchnorm", "sync_batchnorm_apex"}: + return nn.BatchNorm3d + elif cfg.BN.NORM_TYPE == "sub_batchnorm": + return partial(SubBatchNorm3d, num_splits=cfg.BN.NUM_SPLITS) + elif cfg.BN.NORM_TYPE == "sync_batchnorm": + return partial( + NaiveSyncBatchNorm3d, + num_sync_devices=cfg.BN.NUM_SYNC_DEVICES, + global_sync=cfg.BN.GLOBAL_SYNC, + ) + else: + raise NotImplementedError( + "Norm type {} is not supported".format(cfg.BN.NORM_TYPE) + ) + + +class SubBatchNorm3d(nn.Module): + """ + The standard BN layer computes stats across all examples in a GPU. In some + cases it is desirable to compute stats across only a subset of examples + (e.g., in multigrid training https://arxiv.org/abs/1912.00998). + SubBatchNorm3d splits the batch dimension into N splits, and run BN on + each of them separately (so that the stats are computed on each subset of + examples (1/N of batch) independently. During evaluation, it aggregates + the stats from all splits into one BN. + """ + + def __init__(self, num_splits, **args): + """ + Args: + num_splits (int): number of splits. + args (list): other arguments. + """ + super(SubBatchNorm3d, self).__init__() + self.num_splits = num_splits + num_features = args["num_features"] + # Keep only one set of weight and bias. + if args.get("affine", True): + self.affine = True + args["affine"] = False + self.weight = torch.nn.Parameter(torch.ones(num_features)) + self.bias = torch.nn.Parameter(torch.zeros(num_features)) + else: + self.affine = False + self.bn = nn.BatchNorm3d(**args) + args["num_features"] = num_features * num_splits + self.split_bn = nn.BatchNorm3d(**args) + + def _get_aggregated_mean_std(self, means, stds, n): + """ + Calculate the aggregated mean and stds. + Args: + means (tensor): mean values. + stds (tensor): standard deviations. + n (int): number of sets of means and stds. + """ + mean = means.view(n, -1).sum(0) / n + std = ( + stds.view(n, -1).sum(0) / n + + ((means.view(n, -1) - mean) ** 2).view(n, -1).sum(0) / n + ) + return mean.detach(), std.detach() + + def aggregate_stats(self): + """ + Synchronize running_mean, and running_var. Call this before eval. + """ + if self.split_bn.track_running_stats: + ( + self.bn.running_mean.data, + self.bn.running_var.data, + ) = self._get_aggregated_mean_std( + self.split_bn.running_mean, + self.split_bn.running_var, + self.num_splits, + ) + + def forward(self, x): + if self.training: + n, c, t, h, w = x.shape + x = x.view(n // self.num_splits, c * self.num_splits, t, h, w) + x = self.split_bn(x) + x = x.view(n, c, t, h, w) + else: + x = self.bn(x) + if self.affine: + x = x * self.weight.view((-1, 1, 1, 1)) + x = x + self.bias.view((-1, 1, 1, 1)) + return x diff --git a/src/kabr_tools/utils/slowfast/resnet.py b/src/kabr_tools/utils/slowfast/resnet.py new file mode 100644 index 0000000..d98c0f2 --- /dev/null +++ b/src/kabr_tools/utils/slowfast/resnet.py @@ -0,0 +1,926 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +"""Video models.""" + +import torch +import torch.nn as nn +from pytorchvideo.layers.swish import Swish + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + """ + Stochastic Depth per sample. 
+ """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * ( + x.ndim - 1 + ) # work with diff dim tensors, not just 2D ConvNets + mask = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + mask.floor_() # binarize + output = x.div(keep_prob) * mask + return output + +class Nonlocal(nn.Module): + """ + Builds Non-local Neural Networks as a generic family of building + blocks for capturing long-range dependencies. Non-local Network + computes the response at a position as a weighted sum of the + features at all positions. This building block can be plugged into + many computer vision architectures. + More details in the paper: https://arxiv.org/pdf/1711.07971.pdf + """ + + def __init__( + self, + dim, + dim_inner, + pool_size=None, + instantiation="softmax", + zero_init_final_conv=False, + zero_init_final_norm=True, + norm_eps=1e-5, + norm_momentum=0.1, + norm_module=nn.BatchNorm3d, + ): + """ + Args: + dim (int): number of dimension for the input. + dim_inner (int): number of dimension inside of the Non-local block. + pool_size (list): the kernel size of spatial temporal pooling, + temporal pool kernel size, spatial pool kernel size, spatial + pool kernel size in order. By default pool_size is None, + then there would be no pooling used. + instantiation (string): supports two different instantiation method: + "dot_product": normalizing correlation matrix with L2. + "softmax": normalizing correlation matrix with Softmax. + zero_init_final_conv (bool): If true, zero initializing the final + convolution of the Non-local block. + zero_init_final_norm (bool): + If true, zero initializing the final batch norm of the Non-local + block. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + """ + super(Nonlocal, self).__init__() + self.dim = dim + self.dim_inner = dim_inner + self.pool_size = pool_size + self.instantiation = instantiation + self.use_pool = ( + False if pool_size is None else any((size > 1 for size in pool_size)) + ) + self.norm_eps = norm_eps + self.norm_momentum = norm_momentum + self._construct_nonlocal( + zero_init_final_conv, zero_init_final_norm, norm_module + ) + + def _construct_nonlocal( + self, zero_init_final_conv, zero_init_final_norm, norm_module + ): + # Three convolution heads: theta, phi, and g. + self.conv_theta = nn.Conv3d( + self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 + ) + self.conv_phi = nn.Conv3d( + self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 + ) + self.conv_g = nn.Conv3d( + self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 + ) + + # Final convolution output. + self.conv_out = nn.Conv3d( + self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0 + ) + # Zero initializing the final convolution output. + self.conv_out.zero_init = zero_init_final_conv + + # TODO: change the name to `norm` + self.bn = norm_module( + num_features=self.dim, + eps=self.norm_eps, + momentum=self.norm_momentum, + ) + # Zero initializing the final bn. + self.bn.transform_final_bn = zero_init_final_norm + + # Optional to add the spatial-temporal pooling. + if self.use_pool: + self.pool = nn.MaxPool3d( + kernel_size=self.pool_size, + stride=self.pool_size, + padding=[0, 0, 0], + ) + + def forward(self, x): + x_identity = x + N, C, T, H, W = x.size() + + theta = self.conv_theta(x) + + # Perform temporal-spatial pooling to reduce the computation. 
+ if self.use_pool: + x = self.pool(x) + + phi = self.conv_phi(x) + g = self.conv_g(x) + + theta = theta.view(N, self.dim_inner, -1) + phi = phi.view(N, self.dim_inner, -1) + g = g.view(N, self.dim_inner, -1) + + # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW). + theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi)) + # For original Non-local paper, there are two main ways to normalize + # the affinity tensor: + # 1) Softmax normalization (norm on exp). + # 2) dot_product normalization. + if self.instantiation == "softmax": + # Normalizing the affinity tensor theta_phi before softmax. + theta_phi = theta_phi * (self.dim_inner**-0.5) + theta_phi = nn.functional.softmax(theta_phi, dim=2) + elif self.instantiation == "dot_product": + spatial_temporal_dim = theta_phi.shape[2] + theta_phi = theta_phi / spatial_temporal_dim + else: + raise NotImplementedError("Unknown norm type {}".format(self.instantiation)) + + # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW). + theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g)) + + # (N, C, TxHxW) => (N, C, T, H, W). + theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W) + + p = self.conv_out(theta_phi_g) + p = self.bn(p) + return x_identity + p + +class SE(nn.Module): + """Squeeze-and-Excitation (SE) block w/ Swish: AvgPool, FC, Swish, FC, Sigmoid.""" + + def _round_width(self, width, multiplier, min_width=8, divisor=8): + """ + Round width of filters based on width multiplier + Args: + width (int): the channel dimensions of the input. + multiplier (float): the multiplication factor. + min_width (int): the minimum width after multiplication. + divisor (int): the new width should be dividable by divisor. + """ + if not multiplier: + return width + + width *= multiplier + min_width = min_width or divisor + width_out = max(min_width, int(width + divisor / 2) // divisor * divisor) + if width_out < 0.9 * width: + width_out += divisor + return int(width_out) + + def __init__(self, dim_in, ratio, relu_act=True): + """ + Args: + dim_in (int): the channel dimensions of the input. + ratio (float): the channel reduction ratio for squeeze. + relu_act (bool): whether to use ReLU activation instead + of Swish (default). + divisor (int): the new width should be dividable by divisor. + """ + super(SE, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + dim_fc = self._round_width(dim_in, ratio) + self.fc1 = nn.Conv3d(dim_in, dim_fc, 1, bias=True) + self.fc1_act = nn.ReLU() if relu_act else Swish() + self.fc2 = nn.Conv3d(dim_fc, dim_in, 1, bias=True) + + self.fc2_sig = nn.Sigmoid() + + def forward(self, x): + x_in = x + for module in self.children(): + x = module(x) + return x_in * x + + + + +def get_trans_func(name): + """ + Retrieves the transformation module by name. + """ + trans_funcs = { + "bottleneck_transform": BottleneckTransform, + "basic_transform": BasicTransform, + "x3d_transform": X3DTransform, + } + assert ( + name in trans_funcs.keys() + ), "Transformation function '{}' not supported".format(name) + return trans_funcs[name] + + +class BasicTransform(nn.Module): + """ + Basic transformation: Tx3x3, 1x3x3, where T is the size of temporal kernel. + """ + + def __init__( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner=None, + num_groups=1, + stride_1x1=None, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + dilation=1, + norm_module=nn.BatchNorm3d, + block_idx=0, + ): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. 
+ temp_kernel_size (int): the temporal kernel sizes of the first + convolution in the basic block. + stride (int): the stride of the bottleneck. + dim_inner (None): the inner dimension would not be used in + BasicTransform. + num_groups (int): number of groups for the convolution. Number of + group is always 1 for BasicTransform. + stride_1x1 (None): stride_1x1 will not be used in BasicTransform. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + """ + super(BasicTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._bn_mmt = bn_mmt + self._construct(dim_in, dim_out, stride, dilation, norm_module) + + def _construct(self, dim_in, dim_out, stride, dilation, norm_module): + # Tx3x3, BN, ReLU. + self.a = nn.Conv3d( + dim_in, + dim_out, + kernel_size=[self.temp_kernel_size, 3, 3], + stride=[1, stride, stride], + padding=[int(self.temp_kernel_size // 2), 1, 1], + bias=False, + ) + self.a_bn = norm_module( + num_features=dim_out, eps=self._eps, momentum=self._bn_mmt + ) + self.a_relu = nn.ReLU(inplace=self._inplace_relu) + # 1x3x3, BN. + self.b = nn.Conv3d( + dim_out, + dim_out, + kernel_size=[1, 3, 3], + stride=[1, 1, 1], + padding=[0, dilation, dilation], + dilation=[1, dilation, dilation], + bias=False, + ) + + self.b.final_conv = True + + self.b_bn = norm_module( + num_features=dim_out, eps=self._eps, momentum=self._bn_mmt + ) + + self.b_bn.transform_final_bn = True + + def forward(self, x): + x = self.a(x) + x = self.a_bn(x) + x = self.a_relu(x) + + x = self.b(x) + x = self.b_bn(x) + return x + + +class X3DTransform(nn.Module): + """ + X3D transformation: 1x1x1, Tx3x3 (channelwise, num_groups=dim_in), 1x1x1, + augmented with (optional) SE (squeeze-excitation) on the 3x3x3 output. + T is the temporal kernel size (defaulting to 3) + """ + + def __init__( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + dilation=1, + norm_module=nn.BatchNorm3d, + se_ratio=0.0625, + swish_inner=True, + block_idx=0, + ): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + dilation (int): size of dilation. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + se_ratio (float): if > 0, apply SE to the Tx3x3 conv, with the SE + channel dimensionality being se_ratio times the Tx3x3 conv dim. 
+ swish_inner (bool): if True, apply swish to the Tx3x3 conv, otherwise + apply ReLU to the Tx3x3 conv. + """ + super(X3DTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._bn_mmt = bn_mmt + self._se_ratio = se_ratio + self._swish_inner = swish_inner + self._stride_1x1 = stride_1x1 + self._block_idx = block_idx + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + dilation, + norm_module, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + dilation, + norm_module, + ): + (str1x1, str3x3) = (stride, 1) if self._stride_1x1 else (1, stride) + + # 1x1x1, BN, ReLU. + self.a = nn.Conv3d( + dim_in, + dim_inner, + kernel_size=[1, 1, 1], + stride=[1, str1x1, str1x1], + padding=[0, 0, 0], + bias=False, + ) + self.a_bn = norm_module( + num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt + ) + self.a_relu = nn.ReLU(inplace=self._inplace_relu) + + # Tx3x3, BN, ReLU. + self.b = nn.Conv3d( + dim_inner, + dim_inner, + [self.temp_kernel_size, 3, 3], + stride=[1, str3x3, str3x3], + padding=[int(self.temp_kernel_size // 2), dilation, dilation], + groups=num_groups, + bias=False, + dilation=[1, dilation, dilation], + ) + self.b_bn = norm_module( + num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt + ) + + # Apply SE attention or not + use_se = True if (self._block_idx + 1) % 2 else False + if self._se_ratio > 0.0 and use_se: + self.se = SE(dim_inner, self._se_ratio) + + if self._swish_inner: + self.b_relu = Swish() + else: + self.b_relu = nn.ReLU(inplace=self._inplace_relu) + + # 1x1x1, BN. + self.c = nn.Conv3d( + dim_inner, + dim_out, + kernel_size=[1, 1, 1], + stride=[1, 1, 1], + padding=[0, 0, 0], + bias=False, + ) + self.c_bn = norm_module( + num_features=dim_out, eps=self._eps, momentum=self._bn_mmt + ) + self.c_bn.transform_final_bn = True + + def forward(self, x): + for block in self.children(): + x = block(x) + return x + + +class BottleneckTransform(nn.Module): + """ + Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of + temporal kernel. + """ + + def __init__( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + dilation=1, + norm_module=nn.BatchNorm3d, + block_idx=0, + ): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the first + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + dilation (int): size of dilation. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. 
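+
+        Example (illustrative sketch; the shapes are assumptions for the sake
+        of the example):
+
+            import torch
+
+            trans = BottleneckTransform(
+                dim_in=256,
+                dim_out=256,
+                temp_kernel_size=3,
+                stride=1,
+                dim_inner=64,
+                num_groups=1,
+            )
+            out = trans(torch.randn(2, 256, 8, 56, 56))  # shape is preserved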
+ """ + super(BottleneckTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._bn_mmt = bn_mmt + self._stride_1x1 = stride_1x1 + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + dilation, + norm_module, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + dilation, + norm_module, + ): + (str1x1, str3x3) = (stride, 1) if self._stride_1x1 else (1, stride) + + # Tx1x1, BN, ReLU. + self.a = nn.Conv3d( + dim_in, + dim_inner, + kernel_size=[self.temp_kernel_size, 1, 1], + stride=[1, str1x1, str1x1], + padding=[int(self.temp_kernel_size // 2), 0, 0], + bias=False, + ) + self.a_bn = norm_module( + num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt + ) + self.a_relu = nn.ReLU(inplace=self._inplace_relu) + + # 1x3x3, BN, ReLU. + self.b = nn.Conv3d( + dim_inner, + dim_inner, + [1, 3, 3], + stride=[1, str3x3, str3x3], + padding=[0, dilation, dilation], + groups=num_groups, + bias=False, + dilation=[1, dilation, dilation], + ) + self.b_bn = norm_module( + num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt + ) + self.b_relu = nn.ReLU(inplace=self._inplace_relu) + + # 1x1x1, BN. + self.c = nn.Conv3d( + dim_inner, + dim_out, + kernel_size=[1, 1, 1], + stride=[1, 1, 1], + padding=[0, 0, 0], + bias=False, + ) + self.c.final_conv = True + + self.c_bn = norm_module( + num_features=dim_out, eps=self._eps, momentum=self._bn_mmt + ) + self.c_bn.transform_final_bn = True + + def forward(self, x): + # Explicitly forward every layer. + # Branch2a. + x = self.a(x) + x = self.a_bn(x) + x = self.a_relu(x) + + # Branch2b. + x = self.b(x) + x = self.b_bn(x) + x = self.b_relu(x) + + # Branch2c + x = self.c(x) + x = self.c_bn(x) + return x + + +class ResBlock(nn.Module): + """ + Residual block. + """ + + def __init__( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + trans_func, + dim_inner, + num_groups=1, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + dilation=1, + norm_module=nn.BatchNorm3d, + block_idx=0, + drop_connect_rate=0.0, + ): + """ + ResBlock class constructs redisual blocks. More details can be found in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. + "Deep residual learning for image recognition." + https://arxiv.org/abs/1512.03385 + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + trans_func (string): transform function to be used to construct the + bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + dilation (int): size of dilation. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + drop_connect_rate (float): basic rate at which blocks are dropped, + linearly increases from input to output blocks. 
+ """ + super(ResBlock, self).__init__() + self._inplace_relu = inplace_relu + self._eps = eps + self._bn_mmt = bn_mmt + self._drop_connect_rate = drop_connect_rate + self._construct( + dim_in, + dim_out, + temp_kernel_size, + stride, + trans_func, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + norm_module, + block_idx, + ) + + def _construct( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + trans_func, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + norm_module, + block_idx, + ): + # Use skip connection with projection if dim or res change. + if (dim_in != dim_out) or (stride != 1): + self.branch1 = nn.Conv3d( + dim_in, + dim_out, + kernel_size=1, + stride=[1, stride, stride], + padding=0, + bias=False, + dilation=1, + ) + self.branch1_bn = norm_module( + num_features=dim_out, eps=self._eps, momentum=self._bn_mmt + ) + self.branch2 = trans_func( + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation, + norm_module=norm_module, + block_idx=block_idx, + ) + self.relu = nn.ReLU(self._inplace_relu) + + def forward(self, x): + f_x = self.branch2(x) + if self.training and self._drop_connect_rate > 0.0: + f_x = drop_path(f_x, self._drop_connect_rate) + if hasattr(self, "branch1"): + x = self.branch1_bn(self.branch1(x)) + f_x + else: + x = x + f_x + x = self.relu(x) + return x + + +class ResStage(nn.Module): + """ + Stage of 3D ResNet. It expects to have one or more tensors as input for + single pathway (C2D, I3D, Slow), and multi-pathway (SlowFast) cases. + More details can be found here: + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "SlowFast networks for video recognition." + https://arxiv.org/pdf/1812.03982.pdf + """ + + def __init__( + self, + dim_in, + dim_out, + stride, + temp_kernel_sizes, + num_blocks, + dim_inner, + num_groups, + num_block_temp_kernel, + nonlocal_inds, + nonlocal_group, + nonlocal_pool, + dilation, + instantiation="softmax", + trans_func_name="bottleneck_transform", + stride_1x1=False, + inplace_relu=True, + norm_module=nn.BatchNorm3d, + drop_connect_rate=0.0, + ): + """ + The `__init__` method of any subclass should also contain these arguments. + ResStage builds p streams, where p can be greater or equal to one. + Args: + dim_in (list): list of p the channel dimensions of the input. + Different channel dimensions control the input dimension of + different pathways. + dim_out (list): list of p the channel dimensions of the output. + Different channel dimensions control the input dimension of + different pathways. + temp_kernel_sizes (list): list of the p temporal kernel sizes of the + convolution in the bottleneck. Different temp_kernel_sizes + control different pathway. + stride (list): list of the p strides of the bottleneck. Different + stride control different pathway. + num_blocks (list): list of p numbers of blocks for each of the + pathway. + dim_inner (list): list of the p inner channel dimensions of the + input. Different channel dimensions control the input dimension + of different pathways. + num_groups (list): list of number of p groups for the convolution. + num_groups=1 is for standard ResNet like networks, and + num_groups>1 is for ResNeXt like networks. + num_block_temp_kernel (list): extent the temp_kernel_sizes to + num_block_temp_kernel blocks, then fill temporal kernel size + of 1 for the rest of the layers. 
+ nonlocal_inds (list): If the tuple is empty, no nonlocal layer will + be added. If the tuple is not empty, add nonlocal layers after + the index-th block. + dilation (list): size of dilation for each pathway. + nonlocal_group (list): list of number of p nonlocal groups. Each + number controls how to fold temporal dimension to batch + dimension before applying nonlocal transformation. + https://github.com/facebookresearch/video-nonlocal-net. + instantiation (string): different instantiation for nonlocal layer. + Supports two different instantiation method: + "dot_product": normalizing correlation matrix with L2. + "softmax": normalizing correlation matrix with Softmax. + trans_func_name (string): name of the the transformation function apply + on the network. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + drop_connect_rate (float): basic rate at which blocks are dropped, + linearly increases from input to output blocks. + """ + super(ResStage, self).__init__() + assert all( + ( + num_block_temp_kernel[i] <= num_blocks[i] + for i in range(len(temp_kernel_sizes)) + ) + ) + self.num_blocks = num_blocks + self.nonlocal_group = nonlocal_group + self._drop_connect_rate = drop_connect_rate + self.temp_kernel_sizes = [ + (temp_kernel_sizes[i] * num_blocks[i])[: num_block_temp_kernel[i]] + + [1] * (num_blocks[i] - num_block_temp_kernel[i]) + for i in range(len(temp_kernel_sizes)) + ] + assert ( + len( + { + len(dim_in), + len(dim_out), + len(temp_kernel_sizes), + len(stride), + len(num_blocks), + len(dim_inner), + len(num_groups), + len(num_block_temp_kernel), + len(nonlocal_inds), + len(nonlocal_group), + } + ) + == 1 + ) + self.num_pathways = len(self.num_blocks) + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + trans_func_name, + stride_1x1, + inplace_relu, + nonlocal_inds, + nonlocal_pool, + instantiation, + dilation, + norm_module, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + trans_func_name, + stride_1x1, + inplace_relu, + nonlocal_inds, + nonlocal_pool, + instantiation, + dilation, + norm_module, + ): + for pathway in range(self.num_pathways): + for i in range(self.num_blocks[pathway]): + # Retrieve the transformation function. + trans_func = get_trans_func(trans_func_name) + # Construct the block. 
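+                # Only the first block of a stage changes the channel count and
+                # applies the spatial stride; later blocks keep dim_out and stride 1.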
+ res_block = ResBlock( + dim_in[pathway] if i == 0 else dim_out[pathway], + dim_out[pathway], + self.temp_kernel_sizes[pathway][i], + stride[pathway] if i == 0 else 1, + trans_func, + dim_inner[pathway], + num_groups[pathway], + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation[pathway], + norm_module=norm_module, + block_idx=i, + drop_connect_rate=self._drop_connect_rate, + ) + self.add_module("pathway{}_res{}".format( + pathway, i), res_block) + if i in nonlocal_inds[pathway]: + nln = Nonlocal( + dim_out[pathway], + dim_out[pathway] // 2, + nonlocal_pool[pathway], + instantiation=instantiation, + norm_module=norm_module, + ) + self.add_module( + "pathway{}_nonlocal{}".format(pathway, i), nln) + + def forward(self, inputs): + output = [] + for pathway in range(self.num_pathways): + x = inputs[pathway] + for i in range(self.num_blocks[pathway]): + m = getattr(self, "pathway{}_res{}".format(pathway, i)) + x = m(x) + if hasattr(self, "pathway{}_nonlocal{}".format(pathway, i)): + nln = getattr( + self, "pathway{}_nonlocal{}".format(pathway, i)) + b, c, t, h, w = x.shape + if self.nonlocal_group[pathway] > 1: + # Fold temporal dimension into batch dimension. + x = x.permute(0, 2, 1, 3, 4) + x = x.reshape( + b * self.nonlocal_group[pathway], + t // self.nonlocal_group[pathway], + c, + h, + w, + ) + x = x.permute(0, 2, 1, 3, 4) + x = nln(x) + if self.nonlocal_group[pathway] > 1: + # Fold back to temporal dimension. + x = x.permute(0, 2, 1, 3, 4) + x = x.reshape(b, t, c, h, w) + x = x.permute(0, 2, 1, 3, 4) + output.append(x) + + return output diff --git a/src/kabr_tools/utils/slowfast/stem.py b/src/kabr_tools/utils/slowfast/stem.py new file mode 100644 index 0000000..beda2ec --- /dev/null +++ b/src/kabr_tools/utils/slowfast/stem.py @@ -0,0 +1,321 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +"""ResNe(X)t 3D stem helper.""" + +import torch.nn as nn + + +def get_stem_func(name): + """ + Retrieves the stem module by name. + """ + trans_funcs = {"x3d_stem": X3DStem, "basic_stem": ResNetBasicStem} + assert ( + name in trans_funcs.keys() + ), "Transformation function '{}' not supported".format(name) + return trans_funcs[name] + + +class VideoModelStem(nn.Module): + """ + Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool + on input data tensor for one or multiple pathways. + """ + + def __init__( + self, + dim_in, + dim_out, + kernel, + stride, + padding, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + norm_module=nn.BatchNorm3d, + stem_func_name="basic_stem", + ): + """ + The `__init__` method of any subclass should also contain these + arguments. List size of 1 for single pathway models (C2D, I3D, Slow + and etc), list size of 2 for two pathway models (SlowFast). + + Args: + dim_in (list): the list of channel dimensions of the inputs. + dim_out (list): the output dimension of the convolution in the stem + layer. + kernel (list): the kernels' size of the convolutions in the stem + layers. Temporal kernel size, height kernel size, width kernel + size in order. + stride (list): the stride sizes of the convolutions in the stem + layer. Temporal kernel stride, height kernel size, width kernel + size in order. + padding (list): the paddings' sizes of the convolutions in the stem + layer. Temporal padding size, height padding size, width padding + size in order. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. 
+ bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + stem_func_name (string): name of the the stem function applied on + input to the network. + """ + super(VideoModelStem, self).__init__() + + assert ( + len( + { + len(dim_in), + len(dim_out), + len(kernel), + len(stride), + len(padding), + } + ) + == 1 + ), "Input pathway dimensions are not consistent. {} {} {} {} {}".format( + len(dim_in), + len(dim_out), + len(kernel), + len(stride), + len(padding), + ) + + self.num_pathways = len(dim_in) + self.kernel = kernel + self.stride = stride + self.padding = padding + self.inplace_relu = inplace_relu + self.eps = eps + self.bn_mmt = bn_mmt + # Construct the stem layer. + self._construct_stem(dim_in, dim_out, norm_module, stem_func_name) + + def _construct_stem(self, dim_in, dim_out, norm_module, stem_func_name): + trans_func = get_stem_func(stem_func_name) + + for pathway in range(len(dim_in)): + stem = trans_func( + dim_in[pathway], + dim_out[pathway], + self.kernel[pathway], + self.stride[pathway], + self.padding[pathway], + self.inplace_relu, + self.eps, + self.bn_mmt, + norm_module, + ) + self.add_module("pathway{}_stem".format(pathway), stem) + + def forward(self, x): + assert ( + len(x) == self.num_pathways + ), "Input tensor does not contain {} pathway".format(self.num_pathways) + # use a new list, don't modify in-place the x list, which is bad for activation checkpointing. + y = [] + for pathway in range(len(x)): + m = getattr(self, "pathway{}_stem".format(pathway)) + y.append(m(x[pathway])) + return y + + +class ResNetBasicStem(nn.Module): + """ + ResNe(X)t 3D stem module. + Performs spatiotemporal Convolution, BN, and Relu following by a + spatiotemporal pooling. + """ + + def __init__( + self, + dim_in, + dim_out, + kernel, + stride, + padding, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + norm_module=nn.BatchNorm3d, + ): + """ + The `__init__` method of any subclass should also contain these arguments. + + Args: + dim_in (int): the channel dimension of the input. Normally 3 is used + for rgb input, and 2 or 3 is used for optical flow input. + dim_out (int): the output dimension of the convolution in the stem + layer. + kernel (list): the kernel size of the convolution in the stem layer. + temporal kernel size, height kernel size, width kernel size in + order. + stride (list): the stride size of the convolution in the stem layer. + temporal kernel stride, height kernel size, width kernel size in + order. + padding (int): the padding size of the convolution in the stem + layer, temporal padding size, height padding size, width + padding size in order. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + """ + super(ResNetBasicStem, self).__init__() + self.kernel = kernel + self.stride = stride + self.padding = padding + self.inplace_relu = inplace_relu + self.eps = eps + self.bn_mmt = bn_mmt + # Construct the stem layer. 
+ self._construct_stem(dim_in, dim_out, norm_module) + + def _construct_stem(self, dim_in, dim_out, norm_module): + self.conv = nn.Conv3d( + dim_in, + dim_out, + self.kernel, + stride=self.stride, + padding=self.padding, + bias=False, + ) + self.bn = norm_module(num_features=dim_out, + eps=self.eps, momentum=self.bn_mmt) + self.relu = nn.ReLU(self.inplace_relu) + self.pool_layer = nn.MaxPool3d( + kernel_size=[1, 3, 3], stride=[1, 2, 2], padding=[0, 1, 1] + ) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + x = self.pool_layer(x) + return x + + +class X3DStem(nn.Module): + """ + X3D's 3D stem module. + Performs a spatial followed by a depthwise temporal Convolution, BN, and Relu following by a + spatiotemporal pooling. + """ + + def __init__( + self, + dim_in, + dim_out, + kernel, + stride, + padding, + inplace_relu=True, + eps=1e-5, + bn_mmt=0.1, + norm_module=nn.BatchNorm3d, + ): + """ + The `__init__` method of any subclass should also contain these arguments. + + Args: + dim_in (int): the channel dimension of the input. Normally 3 is used + for rgb input, and 2 or 3 is used for optical flow input. + dim_out (int): the output dimension of the convolution in the stem + layer. + kernel (list): the kernel size of the convolution in the stem layer. + temporal kernel size, height kernel size, width kernel size in + order. + stride (list): the stride size of the convolution in the stem layer. + temporal kernel stride, height kernel size, width kernel size in + order. + padding (int): the padding size of the convolution in the stem + layer, temporal padding size, height padding size, width + padding size in order. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + bn_mmt (float): momentum for batch norm. Noted that BN momentum in + PyTorch = 1 - BN momentum in Caffe2. + norm_module (nn.Module): nn.Module for the normalization layer. The + default is nn.BatchNorm3d. + """ + super(X3DStem, self).__init__() + self.kernel = kernel + self.stride = stride + self.padding = padding + self.inplace_relu = inplace_relu + self.eps = eps + self.bn_mmt = bn_mmt + # Construct the stem layer. + self._construct_stem(dim_in, dim_out, norm_module) + + def _construct_stem(self, dim_in, dim_out, norm_module): + self.conv_xy = nn.Conv3d( + dim_in, + dim_out, + kernel_size=(1, self.kernel[1], self.kernel[2]), + stride=(1, self.stride[1], self.stride[2]), + padding=(0, self.padding[1], self.padding[2]), + bias=False, + ) + self.conv = nn.Conv3d( + dim_out, + dim_out, + kernel_size=(self.kernel[0], 1, 1), + stride=(self.stride[0], 1, 1), + padding=(self.padding[0], 0, 0), + bias=False, + groups=dim_out, + ) + + self.bn = norm_module(num_features=dim_out, + eps=self.eps, momentum=self.bn_mmt) + self.relu = nn.ReLU(self.inplace_relu) + + def forward(self, x): + x = self.conv_xy(x) + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class PatchEmbed(nn.Module): + """ + PatchEmbed. 
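+
+    Splits the input into patches with a strided (2D or 3D) convolution and
+    flattens the spatial/temporal grid into a token sequence (B, N, C).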
+ """ + + def __init__( + self, + dim_in=3, + dim_out=768, + kernel=(1, 16, 16), + stride=(1, 4, 4), + padding=(1, 7, 7), + conv_2d=False, + ): + super().__init__() + if conv_2d: + conv = nn.Conv2d + else: + conv = nn.Conv3d + self.proj = conv( + dim_in, + dim_out, + kernel_size=kernel, + stride=stride, + padding=padding, + ) + + def forward(self, x, keep_spatial=False): + x = self.proj(x) + if keep_spatial: + return x, x.shape + # B C (T) H W -> B (T)HW C + return x.flatten(2).transpose(1, 2), x.shape diff --git a/src/kabr_tools/utils/slowfast/utils.py b/src/kabr_tools/utils/slowfast/utils.py new file mode 100644 index 0000000..ec3408d --- /dev/null +++ b/src/kabr_tools/utils/slowfast/utils.py @@ -0,0 +1,115 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# https://github.com/facebookresearch/SlowFast/blob/bac7b672f40d44166a84e8c51d1a5ba367ace816/slowfast/visualization/ava_demo_precomputed_boxes.py + +import math +import cv2 +import torch +import numpy as np +from torch import Tensor + + +def get_sequence(center_idx, half_len, sample_rate, num_frames): + seq = list(range(center_idx - half_len, center_idx + half_len, sample_rate)) + + for seq_idx in range(len(seq)): + if seq[seq_idx] < 0: + seq[seq_idx] = 0 + elif seq[seq_idx] >= num_frames: + seq[seq_idx] = num_frames - 1 + return seq + + +def scale(size, image): + height = image.shape[0] + width = image.shape[1] + if (width <= height and width == size) or (height <= width and height == size): + return image + new_width = size + new_height = size + if width < height: + new_height = int(math.floor((float(height) / width) * size)) + else: + new_width = int(math.floor((float(width) / height) * size)) + img = cv2.resize(image, (new_width, new_height), + interpolation=cv2.INTER_LINEAR) + return img.astype(np.float32) + + +def process_cv2_inputs(frames, cfg): + inputs = torch.from_numpy(np.array(frames)).float() / 255 + inputs = tensor_normalize(inputs, cfg.DATA.MEAN, cfg.DATA.STD) + # T H W C -> C T H W. + inputs = inputs.permute(3, 0, 1, 2) + # Sample frames for num_frames specified. + index = torch.linspace(0, inputs.shape[1] - 1, cfg.DATA.NUM_FRAMES).long() + inputs = torch.index_select(inputs, 1, index) + inputs = pack_pathway_output(cfg, inputs) + inputs = [inp.unsqueeze(0) for inp in inputs] + return inputs + + +def tensor_normalize(tensor, mean, std, func=None): + if tensor.dtype == torch.uint8: + tensor = tensor.float() + tensor = tensor / 255.0 + if type(mean) == list: + mean = torch.tensor(mean) + if type(std) == list: + std = torch.tensor(std) + if func is not None: + tensor = func(tensor) + tensor = tensor - mean + tensor = tensor / std + return tensor + + +def pack_pathway_output(cfg, frames): + if cfg.DATA.REVERSE_INPUT_CHANNEL: + frames = frames[[2, 1, 0], :, :, :] + if cfg.MODEL.ARCH in cfg.MODEL.SINGLE_PATHWAY_ARCH: + frame_list = [frames] + elif cfg.MODEL.ARCH in cfg.MODEL.MULTI_PATHWAY_ARCH: + fast_pathway = frames + # Perform temporal sampling from the fast pathway. 
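+        # The slow pathway keeps 1/ALPHA of the frames, e.g. 32 fast frames
+        # with SLOWFAST.ALPHA = 4 yield an 8-frame slow clip.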
+ slow_pathway = torch.index_select( + frames, + 1, + torch.linspace( + 0, frames.shape[1] - 1, frames.shape[1] // cfg.SLOWFAST.ALPHA + ).long(), + ) + frame_list = [slow_pathway, fast_pathway] + else: + raise NotImplementedError( + "Model arch {} is not in {}".format( + cfg.MODEL.ARCH, + cfg.MODEL.SINGLE_PATHWAY_ARCH + cfg.MODEL.MULTI_PATHWAY_ARCH, + ) + ) + return frame_list + + +def get_input_clip(cap: cv2.VideoCapture, cfg, keyframe_idx: int) -> list[Tensor]: + + seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + seq = get_sequence( + keyframe_idx, + seq_length // 2, + cfg.DATA.SAMPLING_RATE, + total_frames, + ) + clip = [] + for frame_idx in seq: + cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) + was_read, frame = cap.read() + if was_read: + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = scale(cfg.DATA.TEST_CROP_SIZE, frame) + clip.append(frame) + else: + print("Unable to read frame. Duplicating previous frame.") + clip.append(clip[-1]) + + clip = process_cv2_inputs(clip, cfg) + return clip diff --git a/src/kabr_tools/utils/slowfast/x3d.py b/src/kabr_tools/utils/slowfast/x3d.py new file mode 100644 index 0000000..628f51e --- /dev/null +++ b/src/kabr_tools/utils/slowfast/x3d.py @@ -0,0 +1,352 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import math +import torch +from torch import nn +from torch.distributed.algorithms.ddp_comm_hooks import default as comm_hooks_default +from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill + +from .norm import get_norm +from .stem import VideoModelStem +from .resnet import ResStage +from .head import X3DHead + +# round width + + +def round_width(width, multiplier, min_width=1, divisor=1): + if not multiplier: + return width + width *= multiplier + min_width = min_width or divisor + width_out = max(min_width, int(width + divisor / 2) // divisor * divisor) + if width_out < 0.9 * width: + width_out += divisor + return int(width_out) + +# init weights + + +def init_weights( + model, fc_init_std=0.01, zero_init_final_bn=True, zero_init_final_conv=False +): + """ + Performs ResNet style weight initialization. + Args: + fc_init_std (float): the expected standard deviation for fc layer. + zero_init_final_bn (bool): if True, zero initialize the final bn for + every bottleneck. + """ + for m in model.modules(): + if isinstance(m, nn.Conv3d): + # Note that there is no bias due to BN + if hasattr(m, "final_conv") and zero_init_final_conv: + m.weight.data.zero_() + else: + """ + Follow the initialization method proposed in: + {He, Kaiming, et al. + "Delving deep into rectifiers: Surpassing human-level + performance on imagenet classification." 
+ arXiv preprint arXiv:1502.01852 (2015)} + """ + c2_msra_fill(m) + + elif isinstance(m, (nn.BatchNorm3d, nn.BatchNorm2d, nn.BatchNorm1d)): + if ( + hasattr(m, "transform_final_bn") + and m.transform_final_bn + and zero_init_final_bn + ): + batchnorm_weight = 0.0 + else: + batchnorm_weight = 1.0 + if m.weight is not None: + m.weight.data.fill_(batchnorm_weight) + if m.bias is not None: + m.bias.data.zero_() + if isinstance(m, nn.Linear): + if hasattr(m, "xavier_init") and m.xavier_init: + c2_xavier_fill(m) + else: + m.weight.data.normal_(mean=0.0, std=fc_init_std) + if m.bias is not None: + m.bias.data.zero_() + + +# pool1 + +_POOL1 = { + "2d": [[1, 1, 1]], + "c2d": [[2, 1, 1]], + "slow_c2d": [[1, 1, 1]], + "i3d": [[2, 1, 1]], + "slow_i3d": [[1, 1, 1]], + "slow": [[1, 1, 1]], + "slowfast": [[1, 1, 1], [1, 1, 1]], + "x3d": [[1, 1, 1]], +} + +# temporal kernel basis + +_TEMPORAL_KERNEL_BASIS = { + "2d": [ + [[1]], # conv1 temporal kernel. + [[1]], # res2 temporal kernel. + [[1]], # res3 temporal kernel. + [[1]], # res4 temporal kernel. + [[1]], # res5 temporal kernel. + ], + "c2d": [ + [[1]], # conv1 temporal kernel. + [[1]], # res2 temporal kernel. + [[1]], # res3 temporal kernel. + [[1]], # res4 temporal kernel. + [[1]], # res5 temporal kernel. + ], + "slow_c2d": [ + [[1]], # conv1 temporal kernel. + [[1]], # res2 temporal kernel. + [[1]], # res3 temporal kernel. + [[1]], # res4 temporal kernel. + [[1]], # res5 temporal kernel. + ], + "i3d": [ + [[5]], # conv1 temporal kernel. + [[3]], # res2 temporal kernel. + [[3, 1]], # res3 temporal kernel. + [[3, 1]], # res4 temporal kernel. + [[1, 3]], # res5 temporal kernel. + ], + "slow_i3d": [ + [[5]], # conv1 temporal kernel. + [[3]], # res2 temporal kernel. + [[3, 1]], # res3 temporal kernel. + [[3, 1]], # res4 temporal kernel. + [[1, 3]], # res5 temporal kernel. + ], + "slow": [ + [[1]], # conv1 temporal kernel. + [[1]], # res2 temporal kernel. + [[1]], # res3 temporal kernel. + [[3]], # res4 temporal kernel. + [[3]], # res5 temporal kernel. + ], + "slowfast": [ + [[1], [5]], # conv1 temporal kernel for slow and fast pathway. + [[1], [3]], # res2 temporal kernel for slow and fast pathway. + [[1], [3]], # res3 temporal kernel for slow and fast pathway. + [[3], [3]], # res4 temporal kernel for slow and fast pathway. + [[3], [3]], # res5 temporal kernel for slow and fast pathway. + ], + "x3d": [ + [[5]], # conv1 temporal kernels. + [[3]], # res2 temporal kernels. + [[3]], # res3 temporal kernels. + [[3]], # res4 temporal kernels. + [[3]], # res5 temporal kernels. + ], +} + +# model stage depth + +_MODEL_STAGE_DEPTH = {18: (2, 2, 2, 2), 50: (3, 4, 6, 3), 101: (3, 4, 23, 3)} + +# X3D model + + +class X3D(nn.Module): + """ + X3D model builder. It builds a X3D network backbone, which is a ResNet. + + Christoph Feichtenhofer. + "X3D: Expanding Architectures for Efficient Video Recognition." + https://arxiv.org/abs/2004.04730 + """ + + def __init__(self, cfg): + """ + The `__init__` method of any subclass should also contain these + arguments. + + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. 
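+
+        Successive stage widths (dim_res2 through dim_res5) double via
+        round_width (rounded to multiples of 8); dim_res2 is only expanded
+        when X3D.SCALE_RES2 is set.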
+ """ + super(X3D, self).__init__() + self.norm_module = get_norm(cfg) + self.enable_detection = cfg.DETECTION.ENABLE + self.num_pathways = 1 + + exp_stage = 2.0 + self.dim_c1 = cfg.X3D.DIM_C1 + + self.dim_res2 = ( + round_width(self.dim_c1, exp_stage, divisor=8) + if cfg.X3D.SCALE_RES2 + else self.dim_c1 + ) + self.dim_res3 = round_width(self.dim_res2, exp_stage, divisor=8) + self.dim_res4 = round_width(self.dim_res3, exp_stage, divisor=8) + self.dim_res5 = round_width(self.dim_res4, exp_stage, divisor=8) + + self.block_basis = [ + # blocks, c, stride + [1, self.dim_res2, 2], + [2, self.dim_res3, 2], + [5, self.dim_res4, 2], + [3, self.dim_res5, 2], + ] + self._construct_network(cfg) + init_weights( + self, cfg.MODEL.FC_INIT_STD, cfg.RESNET.ZERO_INIT_FINAL_BN + ) + + def _round_repeats(self, repeats, multiplier): + """Round number of layers based on depth multiplier.""" + if not multiplier: + return repeats + return int(math.ceil(multiplier * repeats)) + + def _construct_network(self, cfg): + """ + Builds a single pathway X3D model. + + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + assert cfg.MODEL.ARCH in _POOL1.keys() + assert cfg.RESNET.DEPTH in _MODEL_STAGE_DEPTH.keys() + + (d2, d3, d4, d5) = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH] + + num_groups = cfg.RESNET.NUM_GROUPS + width_per_group = cfg.RESNET.WIDTH_PER_GROUP + dim_inner = num_groups * width_per_group + + w_mul = cfg.X3D.WIDTH_FACTOR + d_mul = cfg.X3D.DEPTH_FACTOR + dim_res1 = round_width(self.dim_c1, w_mul) + + temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH] + + self.s1 = VideoModelStem( + dim_in=cfg.DATA.INPUT_CHANNEL_NUM, + dim_out=[dim_res1], + kernel=[temp_kernel[0][0] + [3, 3]], + stride=[[1, 2, 2]], + padding=[[temp_kernel[0][0][0] // 2, 1, 1]], + norm_module=self.norm_module, + stem_func_name="x3d_stem", + ) + + # blob_in = s1 + dim_in = dim_res1 + for stage, block in enumerate(self.block_basis): + dim_out = round_width(block[1], w_mul) + dim_inner = int(cfg.X3D.BOTTLENECK_FACTOR * dim_out) + + n_rep = self._round_repeats(block[0], d_mul) + # start w res2 to follow convention + prefix = "s{}".format(stage + 2) + + s = ResStage( + dim_in=[dim_in], + dim_out=[dim_out], + dim_inner=[dim_inner], + temp_kernel_sizes=temp_kernel[1], + stride=[block[2]], + num_blocks=[n_rep], + num_groups=[dim_inner] if cfg.X3D.CHANNELWISE_3x3x3 else [ + num_groups], + num_block_temp_kernel=[n_rep], + nonlocal_inds=cfg.NONLOCAL.LOCATION[0], + nonlocal_group=cfg.NONLOCAL.GROUP[0], + nonlocal_pool=cfg.NONLOCAL.POOL[0], + instantiation=cfg.NONLOCAL.INSTANTIATION, + trans_func_name=cfg.RESNET.TRANS_FUNC, + stride_1x1=cfg.RESNET.STRIDE_1X1, + norm_module=self.norm_module, + dilation=cfg.RESNET.SPATIAL_DILATIONS[stage], + drop_connect_rate=cfg.MODEL.DROPCONNECT_RATE + * (stage + 2) + / (len(self.block_basis) + 1), + ) + dim_in = dim_out + self.add_module(prefix, s) + + if self.enable_detection: + NotImplementedError + else: + spat_sz = int(math.ceil(cfg.DATA.TRAIN_CROP_SIZE / 32.0)) + self.head = X3DHead( + dim_in=dim_out, + dim_inner=dim_inner, + dim_out=cfg.X3D.DIM_C5, + num_classes=cfg.MODEL.NUM_CLASSES, + pool_size=[cfg.DATA.NUM_FRAMES, spat_sz, spat_sz], + dropout_rate=cfg.MODEL.DROPOUT_RATE, + act_func=cfg.MODEL.HEAD_ACT, + bn_lin5_on=cfg.X3D.BN_LIN5, + ) + + def forward(self, x, bboxes=None): + for module in self.children(): + x = module(x) + return x + + +def build_model(cfg, gpu_id=None): + if torch.cuda.is_available(): + assert ( + cfg.NUM_GPUS <= torch.cuda.device_count() + ), 
"Cannot use more GPU devices than available" + else: + assert ( + cfg.NUM_GPUS == 0 + ), "Cuda is not available. Please set `NUM_GPUS: 0 for running on CPUs." + + # Construct the model + model = X3D(cfg) + + if cfg.BN.NORM_TYPE == "sync_batchnorm_apex": + try: + import apex + except ImportError: + raise ImportError( + "APEX is required for this model, pelase install") + + process_group = apex.parallel.create_syncbn_process_group( + group_size=cfg.BN.NUM_SYNC_DEVICES + ) + model = apex.parallel.convert_syncbn_model( + model, process_group=process_group) + + if cfg.NUM_GPUS: + if gpu_id is None: + # Determine the GPU used by the current process + cur_device = torch.cuda.current_device() + else: + cur_device = gpu_id + # Transfer the model to the current GPU device + model = model.cuda(device=cur_device) + # Use multi-process data parallel model in the multi-gpu setting + if cfg.NUM_GPUS > 1: + # Make model replica operate on the current device + model = torch.nn.parallel.DistributedDataParallel( + module=model, + device_ids=[cur_device], + output_device=cur_device, + find_unused_parameters=( + True + if cfg.MODEL.DETACH_FINAL_FC + or cfg.MODEL.MODEL_NAME == "ContrastiveModel" + else False + ), + ) + if cfg.MODEL.FP16_ALLREDUCE: + model.register_comm_hook( + state=None, hook=comm_hooks_default.fp16_compress_hook + ) + return model From ecbf84e32cfb5980f845d235876f1adad07bb467 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Mon, 25 Nov 2024 17:14:58 -0500 Subject: [PATCH 18/20] Add slowfast option --- src/kabr_tools/miniscene2behavior.py | 102 ++++++++++++++++----------- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/src/kabr_tools/miniscene2behavior.py b/src/kabr_tools/miniscene2behavior.py index 04bfabd..3ad253e 100644 --- a/src/kabr_tools/miniscene2behavior.py +++ b/src/kabr_tools/miniscene2behavior.py @@ -1,48 +1,15 @@ import sys import argparse +import random import torch from lxml import etree import numpy as np import pandas as pd import cv2 from tqdm import tqdm -import slowfast.utils.checkpoint as cu -from slowfast.models import build -from slowfast.utils import parser -from slowfast.datasets.utils import get_sequence -from slowfast.visualization.utils import process_cv2_inputs -from slowfast.datasets.cv2_transform import scale -from fvcore.common.config import CfgNode -from torch import Tensor - - -def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> list[Tensor]: - # https://github.com/facebookresearch/SlowFast/blob/bac7b672f40d44166a84e8c51d1a5ba367ace816/slowfast/visualization/ava_demo_precomputed_boxes.py - seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE - total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - assert keyframe_idx < total_frames, f"keyframe_idx: {keyframe_idx}" \ - f" >= total_frames: {total_frames}" - seq = get_sequence( - keyframe_idx, - seq_length // 2, - cfg.DATA.SAMPLING_RATE, - total_frames, - ) - - clip = [] - for frame_idx in seq: - cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) - was_read, frame = cap.read() - if was_read: - frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frame = scale(cfg.DATA.TEST_CROP_SIZE, frame) - clip.append(frame) - else: - print("Unable to read frame. 
Duplicating previous frame.") - clip.append(clip[-1]) - - clip = process_cv2_inputs(clip, cfg) - return clip +from kabr_tools.utils.slowfast.utils import get_input_clip +from kabr_tools.utils.slowfast.cfg import load_config, CfgNode +from kabr_tools.utils.slowfast.x3d import build_model def parse_args() -> argparse.Namespace: @@ -83,29 +50,43 @@ def parse_args() -> argparse.Namespace: help="filepath for output csv", default="annotation_data.csv" ) + local_parser.add_argument( + "--slowfast", + action="store_true", + help="load slowfast model" + ) return local_parser.parse_args() -def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[CfgNode, torch.nn.Module]: +def set_seeds(seed): + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + + +def create_slowfast(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[CfgNode, torch.nn.Module]: + import slowfast.utils.checkpoint as cu + from slowfast.models import build + from slowfast.utils import parser + # load model config try: cfg = parser.load_config(parser.parse_args(), config_path) except FileNotFoundError: checkpoint = torch.load( checkpoint_path, map_location=torch.device("cpu")) - with open(config_path, "w") as file: + with open(config_path, "w", encoding="utf-8") as file: file.write(checkpoint["cfg"]) cfg = parser.load_config(parser.parse_args(), config_path) cfg.NUM_GPUS = gpu_num cfg.OUTPUT_DIR = "" - model = build.build_model(cfg) # set random seeds - np.random.seed(cfg.RNG_SEED) - torch.manual_seed(cfg.RNG_SEED) + set_seeds(cfg.RNG_SEED) # load model checkpoint + model = build.build_model(cfg) cu.load_checkpoint(checkpoint_path, model, data_parallel=False) # set model to eval mode @@ -113,6 +94,33 @@ def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[ return cfg, model +def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[CfgNode, torch.nn.Module]: + # load model checkpoint + checkpoint = torch.load(checkpoint_path, weights_only=True, + map_location=torch.device("cpu")) + + # load model config + try: + cfg = load_config(config_path) + except FileNotFoundError: + with open(config_path, "w", encoding="utf-8") as file: + file.write(checkpoint["cfg"]) + cfg = load_config(config_path) + cfg.NUM_GPUS = gpu_num + cfg.OUTPUT_DIR = "" + + # set random seeds + set_seeds(cfg.RNG_SEED) + + # load model + model = build_model(cfg) + model.load_state_dict(checkpoint["model_state"]) + + # set model to eval mode + model.eval() + return cfg, model + + def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module, miniscene_path: str, video: str, output_path: str) -> None: @@ -174,7 +182,15 @@ def main() -> None: # clear arguments to avoid slowfast parsing issues args = parse_args() sys.argv = [sys.argv[0]] - cfg, model = create_model(args.config, args.checkpoint, args.gpu_num) + + # load model + if not args.slowfast: + cfg, model = create_model(args.config, args.checkpoint, args.gpu_num) + else: + cfg, model = create_slowfast( + args.config, args.checkpoint, args.gpu_num) + + # annotate annotate_miniscene(cfg, model, args.miniscene, args.video, args.output) From b55b7adabc17a7330a692076a4441b1190c7f331 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Mon, 25 Nov 2024 17:20:17 -0500 Subject: [PATCH 19/20] Make detectron2 and slowfast optional --- pyproject.toml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6fb4d73..4d4a08a 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -35,9 +35,7 @@ dependencies = [ "ruamel.yaml", "ultralytics", "pandas", - "detectron2 @ git+https://github.com/facebookresearch/detectron2.git@2a420edb307c9bdf640f036d3b196bed474b8593", - "pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@1fadaef40dd393ca09680f55582399f4679fc9b7", - "slowfast @ git+https://github.com/Imageomics/SlowFast@797a6f3ae81c49019d006296f1e0f84f431dc356" + "pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@1fadaef40dd393ca09680f55582399f4679fc9b7" ] keywords = [ "annotation", @@ -68,3 +66,9 @@ miniscene2behavior = "kabr_tools.miniscene2behavior:main" [tool.hatch.version] path = "src/kabr_tools/__about__.py" + +[project.optional-dependencies] +slowfast = [ + "detectron2 @ git+https://github.com/facebookresearch/detectron2.git@2a420edb307c9bdf640f036d3b196bed474b8593", + "slowfast @ git+https://github.com/Imageomics/SlowFast@797a6f3ae81c49019d006296f1e0f84f431dc356" +] From 18129d2b90ce15c4ea15161cc7b2cef922a08a94 Mon Sep 17 00:00:00 2001 From: zhong-al <74470739+zhong-al@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:47:55 -0500 Subject: [PATCH 20/20] Update patch --- tests/test_miniscene2behavior.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_miniscene2behavior.py b/tests/test_miniscene2behavior.py index 7875e2d..2f650cf 100644 --- a/tests/test_miniscene2behavior.py +++ b/tests/test_miniscene2behavior.py @@ -94,9 +94,9 @@ def test_run(self): "--video", self.video] run() - @patch('kabr_tools.miniscene2behavior.process_cv2_inputs') + @patch('kabr_tools.miniscene2behavior.get_input_clip') @patch('kabr_tools.miniscene2behavior.cv2.VideoCapture') - def test_matching_tracks(self, video_capture, process_cv2_inputs): + def test_matching_tracks(self, video_capture, get_input_clip): # Create fake model that always returns a prediction of 1 mock_model = Mock() @@ -131,9 +131,9 @@ def test_matching_tracks(self, video_capture, process_cv2_inputs): "video", "track", "frame", "label"]) self.assertGreater(len(df.index), 0) - @patch('kabr_tools.miniscene2behavior.process_cv2_inputs') + @patch('kabr_tools.miniscene2behavior.get_input_clip') @patch('kabr_tools.miniscene2behavior.cv2.VideoCapture') - def test_nonmatching_tracks(self, video_capture, process_cv2_inputs): + def test_nonmatching_tracks(self, video_capture, get_input_clip): # Create fake model that always returns a prediction of 1 mock_model = Mock()