diff --git a/.gitignore b/.gitignore index e7b5b24..521b2e5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ -.venv +.venv config.py __pycache__ +.DS_Store +.vscode \ No newline at end of file diff --git a/config_template.py b/config_template.py index 1cb4bd8..c6c34b0 100644 --- a/config_template.py +++ b/config_template.py @@ -8,10 +8,16 @@ # Date range (inclusive) for downloads, None value for Days gets replaced by first/last day of the month. START_DAY, START_MONTH, START_YEAR = None, 5, 2020 -END_DAY, END_MONTH, END_YEAR = None , 3, 2022 +END_DAY, END_MONTH, END_YEAR = None, 3, 2022 # Put here emails of the users you want to check for recordings. If empty, all users under the account will be checked. -USERS = [ +USERS_INCLUDE = [ + # R"####@####.####", + # R"####@####.####", +] + +# Put here emails of the users you want to exclude from checking for recordings. Optional. +USERS_EXCLUDE = [ # R"####@####.####", # R"####@####.####", ] @@ -22,6 +28,9 @@ # R"############", ] +# If True, topics that partially match your topic filters are downloaded. If False, only meetings with exact topic matches are downloaded. +PARTIAL_MATCH_TOPICS = False + # Put here the file types you wish to download. If empty, no file type filtering will happen. RECORDING_FILE_TYPES = [ # R"MP4", # Video file of the recording. @@ -32,11 +41,13 @@ # R"SUMMARY", # Summary file of the recording in JSON file format. ] -# If True, recordings will be grouped in folders by their owning user. -GROUP_BY_USER = True - -# If True, recordings will be grouped in folders by their topics -GROUP_BY_TOPIC = True +# Group records in a folder hierarchy using the order below. +# Reorder or comment out any of the folder groups below to control the folder hierarchy created to organize the downloaded recording files. +GROUP_FOLDERS_BY = [ + # R"YEAR_MONTH", # Recordings will be grouped in folders by their recording start date in yyyy-mm format. 
+ R"USER_EMAIL", # Recordings will be grouped in folders by their owning user's email address. + R"TOPIC", # Recordings will be grouped in folders by their topics. +] # If True, each instance of recording will be in its own folder (which may contain multiple files). # Note: One "meeting" can have multiple recording instances. @@ -46,6 +57,18 @@ # This works when "Record a separate audio file of each participant" is enabled. INCLUDE_PARTICIPANT_AUDIO = True +# Recording file name format to use when saving files. Reorder or comment out any file name format pieces below to control the file naming pattern. +# Example: 2023-12-25t143021z__name-of-the-meeting__audio_transcript__ff625374.VTT +FILE_NAME_FORMAT = [ + R"RECORDING_START_DATETIME", # Recording start datetime + R"RECORDING_NAME", # Recording name + R"RECORDING_TYPE", # Recording type + R"FILE_ID", # Recording unique file ID +] + +# Separator character(s) to place in between the file name format pieces when building the recording file names. 
+FILE_NAME_SEPERATOR = "__" + # Set to True for more verbose output VERBOSE_OUTPUT = False diff --git a/zoom_batch_downloader.py b/zoom_batch_downloader.py index a9a8c99..62c5f76 100644 --- a/zoom_batch_downloader.py +++ b/zoom_batch_downloader.py @@ -17,6 +17,9 @@ def main(): print_filter_warnings() + if CONFIG.PARTIAL_MATCH_TOPICS: + CONFIG.TOPICS = [topic.lower() for topic in CONFIG.TOPICS] + from_date = datetime.datetime(CONFIG.START_YEAR, CONFIG.START_MONTH, CONFIG.START_DAY or 1) to_date = datetime.datetime( CONFIG.END_YEAR, CONFIG.END_MONTH, CONFIG.END_DAY or monthrange(CONFIG.END_YEAR, CONFIG.END_MONTH)[1], @@ -38,8 +41,11 @@ def print_filter_warnings(): if CONFIG.TOPICS: utils.print_bright(f'Topics filter is active {CONFIG.TOPICS}') did_print = True - if CONFIG.USERS: - utils.print_bright(f'Users filter is active {CONFIG.USERS}') + if CONFIG.USERS_INCLUDE: + utils.print_bright(f'Users include filter is active {CONFIG.USERS_INCLUDE}') + did_print = True + if CONFIG.USERS_EXCLUDE: + utils.print_bright(f'Users exclude filter is active {CONFIG.USERS_EXCLUDE}') did_print = True if CONFIG.RECORDING_FILE_TYPES: utils.print_bright(f'Recording file types filter is active {CONFIG.RECORDING_FILE_TYPES}') @@ -49,16 +55,29 @@ def print_filter_warnings(): print() def get_users(): - if CONFIG.USERS: - return [(email, '') for email in CONFIG.USERS] - - return paginate_reduce( - 'https://api.zoom.us/v2/users?status=active', [], - lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']] - ) + paginate_reduce( - 'https://api.zoom.us/v2/users?status=inactive', [], - lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']] - ) + if CONFIG.USERS_INCLUDE: + users = [(email, '') for email in CONFIG.USERS_INCLUDE] + else: + users = paginate_reduce( + 'https://api.zoom.us/v2/users?status=active', [], + lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']] + ) + 
paginate_reduce( + 'https://api.zoom.us/v2/users?status=inactive', [], + lambda users, page: users + [(user['email'], get_user_name(user)) for user in page['users']] + ) + + if CONFIG.VERBOSE_OUTPUT: + utils.print_dim('Found matching users:') + + for user_email, user_name in list(users): # Iterate over a snapshot: removing from the list being iterated would skip the next user. + if user_email in CONFIG.USERS_EXCLUDE: + users.remove((user_email, user_name)) + continue + + if CONFIG.VERBOSE_OUTPUT: + utils.print_dim(f'{user_name} <{user_email}>') + + return users def paginate_reduce(url, initial, reduce): initial_url = utils.add_url_params(url, {'page_size': 300}) @@ -125,7 +144,7 @@ def download_recordings(users, from_date, to_date): for user_email, user_name in users: user_description = get_user_description(user_email, user_name) - user_host_folder = get_user_host_folder(user_email) + host_folder = CONFIG.OUTPUT_PATH utils.print_bright( f'Downloading recordings from user {user_description} - Starting at {date_to_str(from_date)} ' @@ -133,7 +152,7 @@ def download_recordings(users, from_date, to_date): ) meetings = get_meetings(get_meeting_uuids(user_email, from_date, to_date)) - user_file_count, user_total_size, user_skipped_count = download_recordings_from_meetings(meetings, user_host_folder) + user_file_count, user_total_size, user_skipped_count = download_recordings_from_meetings(meetings, host_folder, user_email) utils.print_bright('######################################################################') print() @@ -147,12 +166,6 @@ def download_recordings(users, from_date, to_date): def get_user_description(user_email, user_name): return f'{user_email} ({user_name})' if (user_name) else user_email -def get_user_host_folder(user_email): - if CONFIG.GROUP_BY_USER: - return os.path.join(CONFIG.OUTPUT_PATH, user_email) - else: - return CONFIG.OUTPUT_PATH - def date_to_str(date): return date.strftime('%Y-%m-%d') @@ -180,6 +193,8 @@ def get_meeting_uuids(user_email, start_date, end_date): local_start_date = local_end_date + datetime.timedelta(days=1) 
progress_bar.update(1) + utils.print_dim(f"Meetings found: {len(meeting_uuids)}") + return meeting_uuids def get_meetings(meeting_uuids): @@ -189,14 +204,23 @@ def get_meetings(meeting_uuids): url = f'https://api.zoom.us/v2/meetings/{utils.double_encode(meeting_uuid)}/recordings' meetings.append(get_with_token(lambda t: requests.get(url=url, headers=get_headers(t))).json()) + utils.print_dim(f"Recordings found: {len(meetings)}") + return meetings -def download_recordings_from_meetings(meetings, host_folder): +def download_recordings_from_meetings(meetings, host_folder, user_email): file_count, total_size, skipped_count = 0, 0, 0 for meeting in meetings: - if CONFIG.TOPICS and meeting['topic'] not in CONFIG.TOPICS and utils.slugify(meeting['topic']) not in CONFIG.TOPICS: - continue + if CONFIG.TOPICS and meeting['topic']: + if CONFIG.PARTIAL_MATCH_TOPICS: + topic_lower = str.lower(meeting['topic']) + topic_lower_slug = utils.slugify(meeting['topic']) + if not any(topic in topic_lower for topic in CONFIG.TOPICS) and not any(topic in topic_lower_slug for topic in CONFIG.TOPICS): + continue + else: + if meeting['topic'] not in CONFIG.TOPICS and utils.slugify(meeting['topic']) not in CONFIG.TOPICS: + continue recording_files = meeting.get('recording_files') or [] participant_audio_files = meeting.get('participant_audio_files') or [] if CONFIG.INCLUDE_PARTICIPANT_AUDIO else [] @@ -210,17 +234,12 @@ def download_recordings_from_meetings(meetings, host_folder): url = recording_file['download_url'] topic = utils.slugify(meeting['topic']) - ext = recording_file.get('file_extension') or os.path.splitext(recording_file['file_name'])[1] - recording_name = utils.slugify(f'{topic}__{recording_file["recording_start"]}') - file_id = recording_file['id'] - file_name_suffix = os.path.splitext(recording_file['file_name'])[0] + '__' if 'file_name' in recording_file else '' - recording_type_suffix = recording_file["recording_type"] + '__' if 'recording_type' in recording_file else '' - 
file_name = utils.slugify( - f'{recording_name}__{recording_type_suffix}{file_name_suffix}{file_id[-8:]}' - ) + '.' + ext + recording_name = utils.slugify(f'{topic}') + + file_name = build_file_name(recording_file, topic) file_size = int(recording_file.get('file_size')) - if download_recording_file(url, host_folder, file_name, file_size, topic, recording_name): + if download_recording_file(url, host_folder, file_name, file_size, topic, recording_name, recording_file["recording_start"], user_email): total_size += file_size file_count += 1 else: @@ -228,12 +247,39 @@ def download_recordings_from_meetings(meetings, host_folder): return file_count, total_size, skipped_count -def download_recording_file(download_url, host_folder, file_name, file_size, topic, recording_name): +def build_file_name(recording_file, topic): + recording_name = utils.slugify(f'{topic}') + recording_start = utils.slugify(f'{recording_file["recording_start"]}') + file_id = recording_file['id'][-8:] + file_name_suffix = os.path.splitext(recording_file['file_name'])[0] + '__' if 'file_name' in recording_file else '' + recording_type_suffix = '' + + recording_type_suffix = recording_file["recording_type"] if 'recording_type' in recording_file else '' + file_extension = recording_file.get('file_extension') or os.path.splitext(recording_file['file_name'])[1] + + file_name_pieces = [] + for format in CONFIG.FILE_NAME_FORMAT: + if format == "RECORDING_START_DATETIME": + file_name_pieces.append(f'{recording_start}') + if format == "RECORDING_NAME": + file_name_pieces.append(f'{recording_name}{file_name_suffix}') + if format == "RECORDING_TYPE": + file_name_pieces.append(f'{recording_type_suffix}') + if format == "FILE_ID": + file_name_pieces.append(f'{file_id}') + + file_name = utils.slugify(f'{CONFIG.FILE_NAME_SEPERATOR}'.join(file_name_pieces)) + '.' 
+ file_extension + + return file_name + +def download_recording_file(download_url, host_folder, file_name, file_size, topic, recording_name, recording_start, user_email): + folder_path = create_folder_path(host_folder, topic, recording_name, recording_start, user_email) + file_path = os.path.join(folder_path, file_name) + if CONFIG.VERBOSE_OUTPUT: print() utils.print_dim(f'URL: {download_url}') - - file_path = create_path(host_folder, file_name, topic, recording_name) + utils.print_dim(f'Folder: {folder_path}') if os.path.exists(file_path) and abs(os.path.getsize(file_path) - file_size) <= CONFIG.FILE_SIZE_MISMATCH_TOLERANCE: utils.print_dim(f'Skipping existing file: {file_name}') @@ -257,16 +303,25 @@ def download_recording_file(download_url, host_folder, file_name, file_size, top return True -def create_path(host_folder, file_name, topic, recording_name): +def create_folder_path(host_folder, topic, recording_name, recording_start, user_email): folder_path = host_folder - if CONFIG.GROUP_BY_TOPIC: - folder_path = os.path.join(folder_path, topic) + for group_by in CONFIG.GROUP_FOLDERS_BY: + if group_by == "YEAR_MONTH": + recording_start_date = datetime.datetime.strptime(recording_start, '%Y-%m-%dT%H:%M:%SZ') + year_month = recording_start_date.strftime('%Y-%m') + folder_path = os.path.join(folder_path, year_month) + if group_by == "USER_EMAIL": + folder_path = os.path.join(folder_path, user_email) + if group_by == "TOPIC": + folder_path = os.path.join(folder_path, topic) + if CONFIG.GROUP_BY_RECORDING: folder_path = os.path.join(folder_path, recording_name) os.makedirs(folder_path, exist_ok=True) - return os.path.join(folder_path, file_name) + + return folder_path def do_with_token(do): def do_as_get(token):